diff --git a/.gitignore b/.gitignore
index df8b21794..01886f168 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,9 @@ flower_data
*.xml
*.bin
*.mapping
+*.csv
checkpoint
data
VOCdevkit
ssd_resnet50_v1_fpn_shared_box_predictor
+runs
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..f288702d2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/README.md b/README.md
index 7128d5066..4fb186d53 100644
--- a/README.md
+++ b/README.md
@@ -40,56 +40,74 @@
* [ResNeXt network explained](https://www.bilibili.com/video/BV1Ap4y1p71v/)
* [Building ResNeXt with Pytorch](https://www.bilibili.com/video/BV1rX4y1N7tE)
- * MobileNet_v1_v2 (completed)
- * [MobileNet_v1_v2 network explained](https://www.bilibili.com/video/BV1yE411p7L7)
+ * MobileNet_V1_V2 (completed)
+ * [MobileNet_V1_V2 network explained](https://www.bilibili.com/video/BV1yE411p7L7)
* [Building MobileNetV2 with Pytorch](https://www.bilibili.com/video/BV1qE411T7qZ)
* [Building MobileNetV2 with Tensorflow2](https://www.bilibili.com/video/BV1NE411K7tX)
- * MobileNet_v3 (completed)
- * [MobileNet_v3 network explained](https://www.bilibili.com/video/BV1GK4y1p7uE)
+ * MobileNet_V3 (completed)
+ * [MobileNet_V3 network explained](https://www.bilibili.com/video/BV1GK4y1p7uE)
* [Building MobileNetV3 with Pytorch](https://www.bilibili.com/video/BV1zT4y1P7pd)
* [Building MobileNetV3 with Tensorflow2](https://www.bilibili.com/video/BV1KA411g7wX)
- * ShuffleNet_v1_v2 (completed)
- * [ShuffleNet_v1_v2 network explained](https://www.bilibili.com/video/BV15y4y1Y7SY)
- * [Building ShuffleNetv2 with Pytorch](https://www.bilibili.com/video/BV1dh411r76X)
- * [Building ShuffleNetv2 with Tensorflow2](https://www.bilibili.com/video/BV1kr4y1N7bh)
+ * ShuffleNet_V1_V2 (completed)
+ * [ShuffleNet_V1_V2 network explained](https://www.bilibili.com/video/BV15y4y1Y7SY)
+ * [Building ShuffleNetV2 with Pytorch](https://www.bilibili.com/video/BV1dh411r76X)
+ * [Building ShuffleNetV2 with Tensorflow2](https://www.bilibili.com/video/BV1kr4y1N7bh)
- * EfficientNet_v1 (completed)
+ * EfficientNet_V1 (completed)
* [EfficientNet network explained](https://www.bilibili.com/video/BV1XK4y1U7PX)
* [Building EfficientNet with Pytorch](https://www.bilibili.com/video/BV19z4y1179h/)
* [Building EfficientNet with Tensorflow2](https://www.bilibili.com/video/BV1PK4y1S7Jf)
- * EfficientNet_v2 (completed)
- * [EfficientNetV2 network explained](https://b23.tv/NDR7Ug)
- * [Building EfficientNetV2 with Pytorch](https://b23.tv/M4hagB)
- * [Building EfficientNetV2 with Tensorflow](https://b23.tv/KUPbdr)
+ * EfficientNet_V2 (completed)
+ * [EfficientNetV2 network explained](https://www.bilibili.com/video/BV19v41157AU)
+ * [Building EfficientNetV2 with Pytorch](https://www.bilibili.com/video/BV1Xy4y1g74u)
+ * [Building EfficientNetV2 with Tensorflow](https://www.bilibili.com/video/BV19K4y1g7m4)
+
+ * RepVGG (completed)
+ * [RepVGG network explained](https://www.bilibili.com/video/BV15f4y1o7QR)
* Vision Transformer (completed)
- * [Multi-Head Attention explained](https://b23.tv/gucpvt)
+ * [Multi-Head Attention explained](https://www.bilibili.com/video/BV15v411W78M)
* [Vision Transformer network explained](https://www.bilibili.com/video/BV1Jh411Y7WQ)
- * [Building Vision Transformer with Pytorch](https://b23.tv/TT4VBM)
+ * [Building Vision Transformer with Pytorch](https://www.bilibili.com/video/BV1AL411W7dT)
* [Building Vision Transformer with tensorflow2](https://www.bilibili.com/video/BV1q64y1X7GY)
* Swin Transformer (completed)
* [Swin Transformer network explained](https://www.bilibili.com/video/BV1pL4y1v7jC)
- * [Building Swin Transformer with Pytorch](https://b23.tv/vZnpJf)
- * [Building Swin Transformer with Tensorflow2](https://b23.tv/UHLMSF)
+ * [Building Swin Transformer with Pytorch](https://www.bilibili.com/video/BV1yg411K7Yc)
+ * [Building Swin Transformer with Tensorflow2](https://www.bilibili.com/video/BV1bR4y1t7qT)
+
+ * ConvNeXt (completed)
+ * [ConvNeXt network explained](https://www.bilibili.com/video/BV1SS4y157fu)
+ * [Building ConvNeXt with Pytorch](https://www.bilibili.com/video/BV14S4y1L791)
+ * [Building ConvNeXt with Tensorflow2](https://www.bilibili.com/video/BV1TS4y1V7Gz)
+
+ * MobileViT (completed)
+ * [MobileViT network explained](https://www.bilibili.com/video/BV1TG41137sb)
+ * [Building MobileViT with Pytorch](https://www.bilibili.com/video/BV1ae411L7Ki)
* Object Detection
* Faster-RCNN/FPN (completed)
* [Faster-RCNN network explained](https://www.bilibili.com/video/BV1af4y1m7iL)
- * [FPN network explained](https://b23.tv/Qhn6xA)
+ * [FPN network explained](https://www.bilibili.com/video/BV1dh411U7D9)
* [Faster-RCNN source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1of4y1m7nj)
* SSD/RetinaNet (completed)
* [SSD network explained](https://www.bilibili.com/video/BV1fT4y1L7Gi)
- * [RetinaNet network explained](https://b23.tv/ZYCfd2)
+ * [RetinaNet network explained](https://www.bilibili.com/video/BV1Q54y1L7sM)
* [SSD source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1vK411H771)
- * YOLOv3 SPP (completed)
- * [YOLO series networks explained](https://www.bilibili.com/video/BV1yi4y1g7ro)
+ * YOLO Series (completed)
+ * [YOLO series networks explained (V1~V3)](https://www.bilibili.com/video/BV1yi4y1g7ro)
* [YOLOv3 SPP source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1t54y1C7ra)
+ * [YOLOV4 network explained](https://www.bilibili.com/video/BV1NF41147So)
+ * [YOLOV5 network explained](https://www.bilibili.com/video/BV1T3411p7zR)
+ * [YOLOX network explained](https://www.bilibili.com/video/BV1JW4y1k76c)
+
+ * FCOS (completed)
+ * [FCOS network explained](https://www.bilibili.com/video/BV1G5411X7jw)
* Semantic Segmentation
* FCN (completed)
@@ -98,7 +116,7 @@
* DeepLabV3 (completed)
* [DeepLabV1 network explained](https://www.bilibili.com/video/BV1SU4y1N7Ao)
- * [DeepLabV2 network explained](https://www.bilibili.com/video/BV1gP4y1G7TC)
+ * [DeepLabV2 network explained](https://www.bilibili.com/video/BV1gP4y1G7TC)
* [DeepLabV3 network explained](https://www.bilibili.com/video/BV1Jb4y1q7j7)
* [DeepLabV3 source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1TD4y1c7Wx)
@@ -106,21 +124,32 @@
* [LR-ASPP network explained](https://www.bilibili.com/video/BV1LS4y1M76E)
* [LR-ASPP source code walkthrough (Pytorch)](https://www.bilibili.com/video/bv13D4y1F7ML)
- * UNet (in preparation)
- * [UNet network explained](https://www.bilibili.com/video/BV1Vq4y127fB/)
+ * U-Net (completed)
+ * [U-Net network explained](https://www.bilibili.com/video/BV1Vq4y127fB/)
+ * [U-Net source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1Vq4y127fB)
+
+ * U2Net (completed)
+ * [U2Net network explained](https://www.bilibili.com/video/BV1yB4y1z7mj)
+ * [U2Net source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1Kt4y137iS)
+
+* Instance Segmentation
+ * Mask R-CNN (completed)
+ * [Mask R-CNN network explained](https://www.bilibili.com/video/BV1ZY411774T)
+ * [Mask R-CNN source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1hY411E7wD)
+
+* Keypoint Detection
+ * DeepPose (completed)
+ * [DeepPose network explained](https://www.bilibili.com/video/BV1bm421g7aJ)
+ * [DeepPose source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1bm421g7aJ)
+
+ * HRNet (completed)
+ * [HRNet network explained](https://www.bilibili.com/video/BV1bB4y1y7qP)
+ * [HRNet source code walkthrough (Pytorch)](https://www.bilibili.com/video/BV1ar4y157JM)
**[For more related videos, check my bilibili channel](https://space.bilibili.com/18161609/channel/index)**
---
-## Requirements
-* Anaconda3 (recommended)
-* python3.6/3.7/3.8
-* pycharm (IDE)
-* pytorch 1.7.1 (pip package)
-* torchvision 0.8.1 (pip package)
-* tensorflow 2.4.1 (pip package)
-
Feel free to follow my WeChat official account (**阿喆学习小记**), where I regularly post related study notes.
If you have any questions, you are also welcome to discuss them on my CSDN.
diff --git a/article_link/README.md b/article_link/README.md
index cba6499e4..a1ca1ba6f 100644
--- a/article_link/README.md
+++ b/article_link/README.md
@@ -1,7 +1,5 @@
# Paper Links
------
-
## Image Classification
- LeNet [http://yann.lecun.com/exdb/lenet/index.html](http://yann.lecun.com/exdb/lenet/index.html)
- AlexNet [http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf)
@@ -34,7 +32,11 @@
- Swin Transformer V2: Scaling Up Capacity and Resolution [https://arxiv.org/abs/2111.09883](https://arxiv.org/abs/2111.09883)
- BEiT: BERT Pre-Training of Image Transformers [https://arxiv.org/abs/2106.08254](https://arxiv.org/abs/2106.08254)
- MAE(Masked Autoencoders Are Scalable Vision Learners) [https://arxiv.org/abs/2111.06377](https://arxiv.org/abs/2111.06377)
-------
+- ConvNeXt(A ConvNet for the 2020s) [https://arxiv.org/abs/2201.03545](https://arxiv.org/abs/2201.03545)
+- MobileViT V1 [https://arxiv.org/abs/2110.02178](https://arxiv.org/abs/2110.02178)
+- MobileViT V2(Separable Self-attention for Mobile Vision Transformers) [https://arxiv.org/abs/2206.02680](https://arxiv.org/abs/2206.02680)
+- MobileOne(An Improved One millisecond Mobile Backbone) [https://arxiv.org/abs/2206.04040](https://arxiv.org/abs/2206.04040)
+
## Object Detection
- R-CNN [https://arxiv.org/abs/1311.2524](https://arxiv.org/abs/1311.2524)
@@ -51,26 +53,49 @@
- YOLOv3 [https://arxiv.org/abs/1804.02767](https://arxiv.org/abs/1804.02767)
- YOLOv4 [https://arxiv.org/abs/2004.10934](https://arxiv.org/abs/2004.10934)
- YOLOX(Exceeding YOLO Series in 2021) [https://arxiv.org/abs/2107.08430](https://arxiv.org/abs/2107.08430)
+- YOLOv7 [https://arxiv.org/abs/2207.02696](https://arxiv.org/abs/2207.02696)
- PP-YOLO [https://arxiv.org/abs/2007.12099](https://arxiv.org/abs/2007.12099)
- PP-YOLOv2 [https://arxiv.org/abs/2104.10419](https://arxiv.org/abs/2104.10419)
- CornerNet [https://arxiv.org/abs/1808.01244](https://arxiv.org/abs/1808.01244)
-- FCOS [https://arxiv.org/abs/1904.01355](https://arxiv.org/abs/1904.01355)
+- FCOS(Old) [https://arxiv.org/abs/1904.01355](https://arxiv.org/abs/1904.01355)
+- FCOS(New) [https://arxiv.org/abs/2006.09214](https://arxiv.org/abs/2006.09214)
- CenterNet [https://arxiv.org/abs/1904.07850](https://arxiv.org/abs/1904.07850)
-## Image Segmentation
+## Semantic Segmentation
- FCN(Fully Convolutional Networks for Semantic Segmentation) [https://arxiv.org/abs/1411.4038](https://arxiv.org/abs/1411.4038)
- UNet(U-Net: Convolutional Networks for Biomedical Image Segmentation) [https://arxiv.org/abs/1505.04597](https://arxiv.org/abs/1505.04597)
- DeepLabv1(Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected CRFs) [https://arxiv.org/abs/1412.7062](https://arxiv.org/abs/1412.7062)
- DeepLabv2(Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs) [https://arxiv.org/abs/1606.00915](https://arxiv.org/abs/1606.00915)
- DeepLabv3(Rethinking Atrous Convolution for Semantic Image Segmentation) [https://arxiv.org/abs/1706.05587](https://arxiv.org/abs/1706.05587)
- DeepLabv3+(Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation) [https://arxiv.org/abs/1802.02611](https://arxiv.org/abs/1802.02611)
+- SegFormer [https://arxiv.org/abs/2105.15203](https://arxiv.org/abs/2105.15203)
+
+
+## Salient Object Detection
+- U2Net [https://arxiv.org/abs/2005.09007](https://arxiv.org/abs/2005.09007)
+
+
+## Instance Segmentation
- Mask R-CNN [https://arxiv.org/abs/1703.06870](https://arxiv.org/abs/1703.06870)
+## Keypoint Detection
+- HRNet(Deep High-Resolution Representation Learning for Human Pose Estimation) [https://arxiv.org/abs/1902.09212](https://arxiv.org/abs/1902.09212)
+
+## Network Quantization
+- Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference [https://arxiv.org/abs/1712.05877](https://arxiv.org/abs/1712.05877)
+- Quantizing deep convolutional networks for efficient inference: A whitepaper [https://arxiv.org/abs/1806.08342](https://arxiv.org/abs/1806.08342)
+- Data-Free Quantization Through Weight Equalization and Bias Correction [https://arxiv.org/abs/1906.04721](https://arxiv.org/abs/1906.04721)
+- LSQ: Learned Step Size Quantization [https://arxiv.org/abs/1902.08153](https://arxiv.org/abs/1902.08153)
+- LSQ+: Improving low-bit quantization through learnable offsets and better initialization [https://arxiv.org/abs/2004.09576](https://arxiv.org/abs/2004.09576)
+
+
+
## Natural Language Processing
- Attention Is All You Need [https://arxiv.org/abs/1706.03762](https://arxiv.org/abs/1706.03762)
## Others
- Microsoft COCO: Common Objects in Context [https://arxiv.org/abs/1405.0312](https://arxiv.org/abs/1405.0312)
- The PASCAL Visual Object Classes Challenge: A Retrospective [http://host.robots.ox.ac.uk/pascal/VOC/pubs/everingham15.pdf](http://host.robots.ox.ac.uk/pascal/VOC/pubs/everingham15.pdf)
+- Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization [https://arxiv.org/abs/1610.02391](https://arxiv.org/abs/1610.02391)
diff --git a/course_ppt/README.md b/course_ppt/README.md
index 98339951c..42b85e94a 100644
--- a/course_ppt/README.md
+++ b/course_ppt/README.md
@@ -1,42 +1,62 @@
# To keep the project lean, all PPTs used in the course have been moved to Baidu Cloud
+**All PPTs in a single folder** Link: https://pan.baidu.com/s/1VL6QTQ86sfY2aMDVo4Z-kg Extraction code: 4ydw
+
+**Links to the individual PPTs are listed below**:
## Classification networks
-- **AlexNet** Link: https://pan.baidu.com/s/1RJn5lzY8LwrmckUPvXcjmg Password: 34ue
-- **VGG** Link: https://pan.baidu.com/s/1BnYpdaDwAIcgRm7YwakEZw Password: 8ev0
-- **GoogleNet** Link: https://pan.baidu.com/s/1XjZXprvayV3dDMvLjoOk3A Password: 9hq4
-- **ResNet** Link: https://pan.baidu.com/s/1I2LUlwCSjNKr37T0n3NKzg Password: f1s9
-- **ResNext** Link: https://pan.baidu.com/s/1-anFYX5572MJmiQym9D4Eg Password: f8ob
-- **MobileNet_v1_v2** Link: https://pan.baidu.com/s/1ReDDCuK8wyH0XqniUgiSYQ Password: ipqv
-- **MobileNet_v3** Link: https://pan.baidu.com/s/13mzSpyxuA4T4ki7kEN1Xqw Password: fp5g
-- **ShuffleNet_v1_v2** Link: https://pan.baidu.com/s/1-DDwePMPCDvjw08YU8nAAA Password: ad6n
-- **EfficientNet_v1** Link: https://pan.baidu.com/s/1Sep9W0vLzfjhcHAXr6Bv0Q Password: eufl
-- **EfficientNet_v2** Link: https://pan.baidu.com/s/1tesrgY4CHLmq6P7s7TcHCw Password: y2kz
-- **Transformer** Link: https://pan.baidu.com/s/1DE6RDySr7NS0HQ35gBqP_g Password: y9e7
-- **Vision Transformer** Link: https://pan.baidu.com/s/1wzpHG8EK5gxg6UCMscYqMw Password: cm1m
-- **Swin Transformer** Link: https://pan.baidu.com/s/1O6XEEZUb6B6AGYON7-EOgA Password: qkrn
-- **ConfusionMatrix** Link: https://pan.baidu.com/s/1EtKzHkZyv2XssYtqmGYCLg Password: uoo5
+- **AlexNet** Link: https://pan.baidu.com/s/1RJn5lzY8LwrmckUPvXcjmg Extraction code: 34ue
+- **VGG** Link: https://pan.baidu.com/s/1BnYpdaDwAIcgRm7YwakEZw Extraction code: 8ev0
+- **GoogleNet** Link: https://pan.baidu.com/s/1XjZXprvayV3dDMvLjoOk3A Extraction code: 9hq4
+- **ResNet** Link: https://pan.baidu.com/s/1I2LUlwCSjNKr37T0n3NKzg Extraction code: f1s9
+- **ResNext** Link: https://pan.baidu.com/s/1-anFYX5572MJmiQym9D4Eg Extraction code: f8ob
+- **MobileNet_v1_v2** Link: https://pan.baidu.com/s/1ReDDCuK8wyH0XqniUgiSYQ Extraction code: ipqv
+- **MobileNet_v3** Link: https://pan.baidu.com/s/13mzSpyxuA4T4ki7kEN1Xqw Extraction code: fp5g
+- **ShuffleNet_v1_v2** Link: https://pan.baidu.com/s/1-DDwePMPCDvjw08YU8nAAA Extraction code: ad6n
+- **EfficientNet_v1** Link: https://pan.baidu.com/s/1Sep9W0vLzfjhcHAXr6Bv0Q Extraction code: eufl
+- **EfficientNet_v2** Link: https://pan.baidu.com/s/1tesrgY4CHLmq6P7s7TcHCw Extraction code: y2kz
+- **Transformer** Link: https://pan.baidu.com/s/1DE6RDySr7NS0HQ35gBqP_g Extraction code: y9e7
+- **Vision Transformer** Link: https://pan.baidu.com/s/1wzpHG8EK5gxg6UCMscYqMw Extraction code: cm1m
+- **Swin Transformer** Link: https://pan.baidu.com/s/1O6XEEZUb6B6AGYON7-EOgA Extraction code: qkrn
+- **ConvNeXt** Link: https://pan.baidu.com/s/1mgZjkirJPZ8huVls-O0xXA Extraction code: kvqx
+- **RepVGG** Link: https://pan.baidu.com/s/1uJP3hCHI79-tUdBNR_VAWQ Extraction code: qe8a
+- **MobileViT** Link: https://pan.baidu.com/s/1F8QJtFhTPWX8Vjr8_97scQ Extraction code: lfn5
+- **ConfusionMatrix** Link: https://pan.baidu.com/s/1EtKzHkZyv2XssYtqmGYCLg Extraction code: uoo5
+- **Grad-CAM** Link: https://pan.baidu.com/s/1ZHKBW7hINQXFI36hBYdC0Q Extraction code: aru7
## Object detection networks
-- **R-CNN** Link: https://pan.baidu.com/s/1l_ZxkfJdyp3KoMLqwWbx5A Password: nm1l
-- **Fast R-CNN** Link: https://pan.baidu.com/s/1Pe_Tg43OVo-yZWj7t-_L6Q Password: fe73
-- **Faster R-CNN** Link: https://pan.baidu.com/s/1Dd0d_LY8l7Y1YkHQhp-WfA Password: vzp4
-- **FPN** Link: https://pan.baidu.com/s/1O9H0iqQMg9f_FZezUEKZ9g Password: qbl8
-- **SSD** Link: https://pan.baidu.com/s/15zF3GhIdg-E_tZX2Y2X-rw Password: u7k1
-- **RetinaNet** Link: https://pan.baidu.com/s/1beW612VCSnSu-v8iu_2-fA Password: vqbu
-- **YOLOv1** Link: https://pan.baidu.com/s/1vVyUNQHYEGjqosezlx_1Mg Password: b3i0
-- **YOLOv2** Link: https://pan.baidu.com/s/132aW1e_NYbaxxGi3cDVLYg Password: tak7
-- **YOLOv3** Link: https://pan.baidu.com/s/10oqZewzJmx5ptT9A4t-64w Password: npji
-- **YOLOv3SPP** Link: https://pan.baidu.com/s/15LRssnPez9pn6jRpW89Wlw Password: nv9f
-- **Calculate mAP** Link: https://pan.baidu.com/s/1jdA_n78J7nSUoOg6TTO5Bg Password: eh62
-- **Introduction to the COCO dataset** Link: https://pan.baidu.com/s/1HfCvjt-8o9j5a916IYNVjw Password: 6rec
+- **R-CNN** Link: https://pan.baidu.com/s/1l_ZxkfJdyp3KoMLqwWbx5A Extraction code: nm1l
+- **Fast R-CNN** Link: https://pan.baidu.com/s/1Pe_Tg43OVo-yZWj7t-_L6Q Extraction code: fe73
+- **Faster R-CNN** Link: https://pan.baidu.com/s/1Dd0d_LY8l7Y1YkHQhp-WfA Extraction code: vzp4
+- **FPN** Link: https://pan.baidu.com/s/1O9H0iqQMg9f_FZezUEKZ9g Extraction code: qbl8
+- **SSD** Link: https://pan.baidu.com/s/15zF3GhIdg-E_tZX2Y2X-rw Extraction code: u7k1
+- **RetinaNet** Link: https://pan.baidu.com/s/1beW612VCSnSu-v8iu_2-fA Extraction code: vqbu
+- **YOLOv1** Link: https://pan.baidu.com/s/1vVyUNQHYEGjqosezlx_1Mg Extraction code: b3i0
+- **YOLOv2** Link: https://pan.baidu.com/s/132aW1e_NYbaxxGi3cDVLYg Extraction code: tak7
+- **YOLOv3** Link: https://pan.baidu.com/s/1hZqdgh7wA7QeGAYTttlVOQ Extraction code: 5ulo
+- **YOLOv3SPP** Link: https://pan.baidu.com/s/15LRssnPez9pn6jRpW89Wlw Extraction code: nv9f
+- **YOLOv4** Link: https://pan.baidu.com/s/1Ltw4v1pg0eZNFYR2ZBbZmQ Extraction code: qjx4
+- **YOLOv5** Link: https://pan.baidu.com/s/1rnvjwHLvOlJ9KpJ5z95GWw Extraction code: kt04
+- **YOLOX** Link: https://pan.baidu.com/s/1ex54twQC7hBE3szNko_K5A Extraction code: al0r
+- **FCOS** Link: https://pan.baidu.com/s/1KUc9dzvAbtwtGGm3ZZy_cw Extraction code: h0as
+- **Calculate mAP** Link: https://pan.baidu.com/s/1jdA_n78J7nSUoOg6TTO5Bg Extraction code: eh62
+- **Introduction to the COCO dataset** Link: https://pan.baidu.com/s/1HfCvjt-8o9j5a916IYNVjw Extraction code: 6rec
## Image segmentation networks
-- **Introduction to semantic segmentation** Link: https://pan.baidu.com/s/1cwxe2wbaA_2DqNYADq3myA Password: zzij
-- **Transposed convolution** Link: https://pan.baidu.com/s/1A8688168fuWHyxJQtzupHw Password: pgnf
-- **FCN** Link: https://pan.baidu.com/s/1XLUneTLrdUyDAiV6kqi9rw Password: 126a
-- **Dilated convolution** Link: https://pan.baidu.com/s/1QlQyniuMhBeXyEK420MIdQ Password: ry6p
-- **DeepLab V1** Link: https://pan.baidu.com/s/1NFxb7ADQOMVYLxmIKqTONQ Password: 500s
-- **DeepLab V2** Link: https://pan.baidu.com/s/1woe3lJYBVkOdnn6XXlKf8g Password: 76ec
-- **DeepLab V3** Link: https://pan.baidu.com/s/1WVBgc2Ld13D0_dkHGwhTpA Password: m54m
\ No newline at end of file
+- **Introduction to semantic segmentation** Link: https://pan.baidu.com/s/1cwxe2wbaA_2DqNYADq3myA Extraction code: zzij
+- **Transposed convolution** Link: https://pan.baidu.com/s/1A8688168fuWHyxJQtzupHw Extraction code: pgnf
+- **FCN** Link: https://pan.baidu.com/s/1XLUneTLrdUyDAiV6kqi9rw Extraction code: 126a
+- **Dilated convolution** Link: https://pan.baidu.com/s/1QlQyniuMhBeXyEK420MIdQ Extraction code: ry6p
+- **DeepLab V1** Link: https://pan.baidu.com/s/1NFxb7ADQOMVYLxmIKqTONQ Extraction code: 500s
+- **DeepLab V2** Link: https://pan.baidu.com/s/1woe3lJYBVkOdnn6XXlKf8g Extraction code: 76ec
+- **DeepLab V3** Link: https://pan.baidu.com/s/1WVBgc2Ld13D0_dkHGwhTpA Extraction code: m54m
+- **U2Net** Link: https://pan.baidu.com/s/1ekbEm4dsjlFamK8dCs8yfA Extraction code: 472j
+
+
+## Instance segmentation
+- **Mask R-CNN** Link: https://pan.baidu.com/s/1JpQ7ENEv_x9A1-O_NpjwYA Extraction code: 1t4i
+
+## Keypoint detection
+- **HRNet** Link: https://pan.baidu.com/s/1-8AJdU82K1j70KZK_rN7aQ Extraction code: t4me
+
diff --git a/data_set/README.md b/data_set/README.md
index b81800caf..60007a5a5 100644
--- a/data_set/README.md
+++ b/data_set/README.md
@@ -1,7 +1,7 @@
-## This folder is used to store the training samples
+## This folder is used to store the training data
### Usage steps:
* (1) Create a new folder "flower_data" under the data_set folder
-* (2) Download the flower classification dataset from [http://download.tensorflow.org/example_images/flower_photos.tgz](http://download.tensorflow.org/example_images/flower_photos.tgz)
+* (2) Download the flower classification dataset from [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz)
* (3) Extract the dataset into the flower_data folder
* (4) Run the "split_data.py" script to automatically split the dataset into a training set (train) and a validation set (val); a minimal sketch of such a split is shown at the end of this file
@@ -10,4 +10,4 @@
├── flower_photos (extracted dataset folder, 3670 samples)
├── train (generated training set, 3306 samples)
└── val (generated validation set, 364 samples)
-```
\ No newline at end of file
+```
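+
+For reference, the split performed by "split_data.py" is essentially a per-class random train/val split (roughly 9:1, judging from the sample counts above). Below is a minimal, hypothetical sketch of such a split; the actual script in this repository may differ in its details:
+```python
+import os
+import random
+from shutil import copy
+
+
+def split_dataset(src_root: str = "flower_data/flower_photos",
+                  dst_root: str = "flower_data",
+                  val_rate: float = 0.1):
+    random.seed(0)
+    classes = [c for c in os.listdir(src_root)
+               if os.path.isdir(os.path.join(src_root, c))]
+    for cls in classes:
+        images = os.listdir(os.path.join(src_root, cls))
+        # hold out roughly val_rate of every class for validation
+        val_images = set(random.sample(images, k=int(len(images) * val_rate)))
+        for name in images:
+            subset = "val" if name in val_images else "train"
+            dst_dir = os.path.join(dst_root, subset, cls)
+            os.makedirs(dst_dir, exist_ok=True)
+            copy(os.path.join(src_root, cls, name), dst_dir)
+
+
+if __name__ == "__main__":
+    split_dataset()
+```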
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/README.md b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/README.md
new file mode 100644
index 000000000..0376a4994
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/README.md
@@ -0,0 +1,15 @@
+This project shows how to convert the ResNet34 network from Pytorch into OpenVINO's IR format and then quantize it. The workflow is as follows:
+1. Set up the environment according to `requirements.txt`
+2. Download the pre-trained ResNet34 weights (trained earlier on the flower classification dataset) and place them in the current folder. Baidu Cloud link: https://pan.baidu.com/s/1x4WFX1HynYcXLium3UaaFQ Password: qvi6
+3. Use `convert_pytorch2onnx.py` to convert ResNet34 to the ONNX format (a rough sketch of this step is included at the end of this README)
+4. Convert the ONNX model to the IR format with the following command:
+```
+mo --input_model resnet34.onnx \
+ --input_shape "[1,3,224,224]" \
+ --mean_values="[123.675,116.28,103.53]" \
+ --scale_values="[58.395,57.12,57.375]" \
+ --data_type FP32 \
+ --output_dir ir_output
+```
+5. Download and extract the flower classification dataset, then point `data_path` in `quantization_int8.py` to the extracted `flower_photos` folder
+6. Quantize the model with `quantization_int8.py`
\ No newline at end of file
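+
+For reference, the Pytorch-to-ONNX export in step 3 usually boils down to something like the minimal sketch below (the weight file name `resNet34.pth` and the helper function are placeholders; the actual `convert_pytorch2onnx.py` in this folder may differ in its details):
+```python
+import torch
+from torchvision.models import resnet34
+
+
+def export_onnx(weights_path: str = "resNet34.pth", onnx_path: str = "resnet34.onnx"):
+    # the flower classification dataset has 5 classes
+    model = resnet34(num_classes=5)
+    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
+    model.eval()
+
+    # fixed input size, matching --input_shape "[1,3,224,224]" passed to mo above
+    dummy_input = torch.randn(1, 3, 224, 224)
+    torch.onnx.export(model,
+                      dummy_input,
+                      onnx_path,
+                      input_names=["input"],
+                      output_names=["output"],
+                      opset_version=11)
+
+
+if __name__ == "__main__":
+    export_onnx()
+```
+Note that the `--mean_values`/`--scale_values` arguments given to `mo` in step 4 fold the ImageNet normalization into the IR graph, so the resulting IR model expects raw (unnormalized) pixel values as its input.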
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/compare_fps.py b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/compare_fps.py
new file mode 100644
index 000000000..c74639c25
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/compare_fps.py
@@ -0,0 +1,126 @@
+import time
+import numpy as np
+import torch
+import onnxruntime
+import matplotlib.pyplot as plt
+from openvino.runtime import Core
+from torchvision.models import resnet34
+
+
+def normalize(image: np.ndarray) -> np.ndarray:
+ """
+ Normalize the image to the given mean and standard deviation
+ """
+ image = image.astype(np.float32)
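+    # ImageNet channel statistics; they match the --mean_values/--scale_values
+    # baked into the IR model at conversion time (e.g. 123.675 / 255 = 0.485)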
+ mean = (0.485, 0.456, 0.406)
+ std = (0.229, 0.224, 0.225)
+ image /= 255.0
+ image -= mean
+ image /= std
+ return image
+
+
+def onnx_inference(onnx_path: str, image: np.ndarray, num_images: int = 20):
+ # load onnx model
+ ort_session = onnxruntime.InferenceSession(onnx_path)
+
+ # compute onnx Runtime output prediction
+ ort_inputs = {ort_session.get_inputs()[0].name: image}
+
+ start = time.perf_counter()
+ for _ in range(num_images):
+ ort_session.run(None, ort_inputs)
+ end = time.perf_counter()
+ time_onnx = end - start
+    print(
+        f"ONNX model in ONNX Runtime/CPU: {time_onnx / num_images:.3f} "
+        f"seconds per image, FPS: {num_images / time_onnx:.2f}"
+    )
+
+ return num_images / time_onnx
+
+
+def ir_inference(ir_path: str, image: np.ndarray, num_images: int = 20):
+ # Load the network in Inference Engine
+ ie = Core()
+ model_ir = ie.read_model(model=ir_path)
+ compiled_model_ir = ie.compile_model(model=model_ir, device_name="CPU")
+
+ # Get input and output layers
+ input_layer_ir = next(iter(compiled_model_ir.inputs))
+ output_layer_ir = next(iter(compiled_model_ir.outputs))
+
+ start = time.perf_counter()
+ request_ir = compiled_model_ir.create_infer_request()
+ for _ in range(num_images):
+ request_ir.infer(inputs={input_layer_ir.any_name: image})
+ end = time.perf_counter()
+ time_ir = end - start
+ print(
+ f"IR model in Inference Engine/CPU: {time_ir / num_images:.3f} "
+ f"seconds per image, FPS: {num_images / time_ir:.2f}"
+ )
+
+ return num_images / time_ir
+
+
+def pytorch_inference(image: np.ndarray, num_images: int = 20):
+ image = torch.as_tensor(image, dtype=torch.float32)
+
+ model = resnet34(pretrained=False, num_classes=5)
+ model.eval()
+
+ with torch.no_grad():
+ start = time.perf_counter()
+ for _ in range(num_images):
+ model(image)
+ end = time.perf_counter()
+ time_torch = end - start
+
+ print(
+ f"PyTorch model on CPU: {time_torch / num_images:.3f} seconds per image, "
+ f"FPS: {num_images / time_torch:.2f}"
+ )
+
+ return num_images / time_torch
+
+
+def plot_fps(v: dict):
+ x = list(v.keys())
+ y = list(v.values())
+
+ plt.bar(range(len(x)), y, align='center')
+ plt.xticks(range(len(x)), x)
+ for i, v in enumerate(y):
+ plt.text(x=i, y=v+0.5, s=f"{v:.2f}", ha='center')
+ plt.xlabel('model format')
+ plt.ylabel('fps')
+ plt.title('FPS comparison')
+    plt.savefig('fps_vs.jpg')
+    plt.show()
+
+
+def main():
+ image_h = 224
+ image_w = 224
+ onnx_path = "resnet34.onnx"
+ ir_path = "ir_output/resnet34.xml"
+
+ image = np.random.randn(image_h, image_w, 3)
+ normalized_image = normalize(image)
+
+ # Convert the resized images to network input shape
+ # [h, w, c] -> [c, h, w] -> [1, c, h, w]
+ input_image = np.expand_dims(np.transpose(image, (2, 0, 1)), 0)
+ normalized_input_image = np.expand_dims(np.transpose(normalized_image, (2, 0, 1)), 0)
+
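+    # The IR model receives the raw (un-normalized) image: the mean/scale normalization
+    # was folded into the IR by mo (--mean_values/--scale_values), whereas the ONNX and
+    # PyTorch models still expect the normalized input.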
+ onnx_fps = onnx_inference(onnx_path, normalized_input_image, num_images=100)
+ ir_fps = ir_inference(ir_path, input_image, num_images=100)
+ pytorch_fps = pytorch_inference(normalized_input_image, num_images=100)
+ plot_fps({"pytorch": round(pytorch_fps, 2),
+ "onnx": round(onnx_fps, 2),
+ "ir": round(ir_fps, 2)})
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/compare_onnx_and_ir.py b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/compare_onnx_and_ir.py
new file mode 100644
index 000000000..c8ac7f32e
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/compare_onnx_and_ir.py
@@ -0,0 +1,65 @@
+import numpy as np
+import onnxruntime
+from openvino.runtime import Core
+
+
+def normalize(image: np.ndarray) -> np.ndarray:
+ """
+ Normalize the image to the given mean and standard deviation
+ """
+ image = image.astype(np.float32)
+ mean = (0.485, 0.456, 0.406)
+ std = (0.229, 0.224, 0.225)
+ image /= 255.0
+ image -= mean
+ image /= std
+ return image
+
+
+def onnx_inference(onnx_path: str, image: np.ndarray):
+ # load onnx model
+ ort_session = onnxruntime.InferenceSession(onnx_path)
+
+ # compute onnx Runtime output prediction
+ ort_inputs = {ort_session.get_inputs()[0].name: image}
+ res_onnx = ort_session.run(None, ort_inputs)[0]
+ return res_onnx
+
+
+def ir_inference(ir_path: str, image: np.ndarray):
+ # Load the network in Inference Engine
+ ie = Core()
+ model_ir = ie.read_model(model=ir_path)
+ compiled_model_ir = ie.compile_model(model=model_ir, device_name="CPU")
+
+ # Get input and output layers
+ input_layer_ir = next(iter(compiled_model_ir.inputs))
+ output_layer_ir = next(iter(compiled_model_ir.outputs))
+
+ # Run inference on the input image
+ res_ir = compiled_model_ir([image])[output_layer_ir]
+ return res_ir
+
+
+def main():
+ image_h = 224
+ image_w = 224
+ onnx_path = "resnet34.onnx"
+ ir_path = "ir_output/resnet34.xml"
+
+ image = np.random.randn(image_h, image_w, 3)
+ normalized_image = normalize(image)
+
+ # Convert the resized images to network input shape
+ # [h, w, c] -> [c, h, w] -> [1, c, h, w]
+ input_image = np.expand_dims(np.transpose(image, (2, 0, 1)), 0)
+ normalized_input_image = np.expand_dims(np.transpose(normalized_image, (2, 0, 1)), 0)
+
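+    # note: the IR model takes the raw image, since mo already folded the normalization into it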
+ onnx_res = onnx_inference(onnx_path, normalized_input_image)
+ ir_res = ir_inference(ir_path, input_image)
+ np.testing.assert_allclose(onnx_res, ir_res, rtol=1e-03, atol=1e-05)
+ print("Exported model has been tested with OpenvinoRuntime, and the result looks good!")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/convert_pytorch2onnx.py b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/convert_pytorch2onnx.py
new file mode 100644
index 000000000..9fd00349a
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/convert_pytorch2onnx.py
@@ -0,0 +1,56 @@
+import torch
+import torch.onnx
+import onnx
+import onnxruntime
+import numpy as np
+from torchvision.models import resnet34
+
+device = torch.device("cpu")
+
+
+def to_numpy(tensor):
+ return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+
+def main():
+ weights_path = "resNet34(flower).pth"
+ onnx_file_name = "resnet34.onnx"
+ batch_size = 1
+ img_h = 224
+ img_w = 224
+ img_channel = 3
+
+    # create model and load pretrained weights
+ model = resnet34(pretrained=False, num_classes=5)
+ model.load_state_dict(torch.load(weights_path, map_location='cpu'))
+
+ model.eval()
+ # input to the model
+ # [batch, channel, height, width]
+ x = torch.rand(batch_size, img_channel, img_h, img_w, requires_grad=True)
+ torch_out = model(x)
+
+ # export the model
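+    # no dynamic_axes are specified, so the exported graph keeps a fixed input shape of
+    # [1, 3, 224, 224], the same shape later passed to mo via --input_shape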
+ torch.onnx.export(model, # model being run
+ x, # model input (or a tuple for multiple inputs)
+ onnx_file_name, # where to save the model (can be a file or file-like object)
+ verbose=False)
+
+ # check onnx model
+ onnx_model = onnx.load(onnx_file_name)
+ onnx.checker.check_model(onnx_model)
+
+ ort_session = onnxruntime.InferenceSession(onnx_file_name)
+
+ # compute ONNX Runtime output prediction
+ ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
+ ort_outs = ort_session.run(None, ort_inputs)
+
+ # compare ONNX Runtime and Pytorch results
+ # assert_allclose: Raises an AssertionError if two objects are not equal up to desired tolerance.
+ np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)
+ print("Exported model has been tested with ONNXRuntime, and the result looks good!")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/model.py b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/model.py
new file mode 100644
index 000000000..c6faa981c
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/model.py
@@ -0,0 +1,302 @@
+from typing import Callable, List, Optional
+
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+from functools import partial
+
+
+def _make_divisible(ch, divisor=8, min_ch=None):
+ """
+ This function is taken from the original tf repo.
+ It ensures that all layers have a channel number that is divisible by 8
+ It can be seen here:
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+ """
+ if min_ch is None:
+ min_ch = divisor
+ new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor)
+ # Make sure that round down does not go down by more than 10%.
+ if new_ch < 0.9 * ch:
+ new_ch += divisor
+ return new_ch
+
+
+class ConvBNActivation(nn.Sequential):
+ def __init__(self,
+ in_planes: int,
+ out_planes: int,
+ kernel_size: int = 3,
+ stride: int = 1,
+ groups: int = 1,
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
+ activation_layer: Optional[Callable[..., nn.Module]] = None):
+ padding = (kernel_size - 1) // 2
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+ if activation_layer is None:
+ activation_layer = nn.ReLU6
+ super(ConvBNActivation, self).__init__(nn.Conv2d(in_channels=in_planes,
+ out_channels=out_planes,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ groups=groups,
+ bias=False),
+ norm_layer(out_planes),
+ activation_layer(inplace=True))
+
+
+class SqueezeExcitation(nn.Module):
+ def __init__(self, input_c: int, squeeze_factor: int = 4):
+ super(SqueezeExcitation, self).__init__()
+ squeeze_c = _make_divisible(input_c // squeeze_factor, 8)
+ self.fc1 = nn.Conv2d(input_c, squeeze_c, 1)
+ self.fc2 = nn.Conv2d(squeeze_c, input_c, 1)
+
+ def forward(self, x: Tensor) -> Tensor:
+ scale = F.adaptive_avg_pool2d(x, output_size=(1, 1))
+ scale = self.fc1(scale)
+ scale = F.relu(scale, inplace=True)
+ scale = self.fc2(scale)
+ scale = F.hardsigmoid(scale, inplace=True)
+ return scale * x
+
+
+class InvertedResidualConfig:
+ def __init__(self,
+ input_c: int,
+ kernel: int,
+ expanded_c: int,
+ out_c: int,
+ use_se: bool,
+ activation: str,
+ stride: int,
+ width_multi: float):
+ self.input_c = self.adjust_channels(input_c, width_multi)
+ self.kernel = kernel
+ self.expanded_c = self.adjust_channels(expanded_c, width_multi)
+ self.out_c = self.adjust_channels(out_c, width_multi)
+ self.use_se = use_se
+ self.use_hs = activation == "HS" # whether using h-swish activation
+ self.stride = stride
+
+ @staticmethod
+ def adjust_channels(channels: int, width_multi: float):
+ return _make_divisible(channels * width_multi, 8)
+
+
+class InvertedResidual(nn.Module):
+ def __init__(self,
+ cnf: InvertedResidualConfig,
+ norm_layer: Callable[..., nn.Module]):
+ super(InvertedResidual, self).__init__()
+
+ if cnf.stride not in [1, 2]:
+ raise ValueError("illegal stride value.")
+
+ self.use_res_connect = (cnf.stride == 1 and cnf.input_c == cnf.out_c)
+
+ layers: List[nn.Module] = []
+ activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU
+
+ # expand
+ if cnf.expanded_c != cnf.input_c:
+ layers.append(ConvBNActivation(cnf.input_c,
+ cnf.expanded_c,
+ kernel_size=1,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer))
+
+ # depthwise
+ layers.append(ConvBNActivation(cnf.expanded_c,
+ cnf.expanded_c,
+ kernel_size=cnf.kernel,
+ stride=cnf.stride,
+ groups=cnf.expanded_c,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer))
+
+ if cnf.use_se:
+ layers.append(SqueezeExcitation(cnf.expanded_c))
+
+ # project
+ layers.append(ConvBNActivation(cnf.expanded_c,
+ cnf.out_c,
+ kernel_size=1,
+ norm_layer=norm_layer,
+ activation_layer=nn.Identity))
+
+ self.block = nn.Sequential(*layers)
+ self.out_channels = cnf.out_c
+ self.is_strided = cnf.stride > 1
+
+ def forward(self, x: Tensor) -> Tensor:
+ result = self.block(x)
+ if self.use_res_connect:
+ result += x
+
+ return result
+
+
+class MobileNetV3(nn.Module):
+ def __init__(self,
+ inverted_residual_setting: List[InvertedResidualConfig],
+ last_channel: int,
+ num_classes: int = 1000,
+ block: Optional[Callable[..., nn.Module]] = None,
+ norm_layer: Optional[Callable[..., nn.Module]] = None):
+ super(MobileNetV3, self).__init__()
+
+ if not inverted_residual_setting:
+ raise ValueError("The inverted_residual_setting should not be empty.")
+ elif not (isinstance(inverted_residual_setting, List) and
+ all([isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting])):
+ raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]")
+
+ if block is None:
+ block = InvertedResidual
+
+ if norm_layer is None:
+ norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)
+
+ layers: List[nn.Module] = []
+
+ # building first layer
+ firstconv_output_c = inverted_residual_setting[0].input_c
+ layers.append(ConvBNActivation(3,
+ firstconv_output_c,
+ kernel_size=3,
+ stride=2,
+ norm_layer=norm_layer,
+ activation_layer=nn.Hardswish))
+ # building inverted residual blocks
+ for cnf in inverted_residual_setting:
+ layers.append(block(cnf, norm_layer))
+
+ # building last several layers
+ lastconv_input_c = inverted_residual_setting[-1].out_c
+ lastconv_output_c = 6 * lastconv_input_c
+ layers.append(ConvBNActivation(lastconv_input_c,
+ lastconv_output_c,
+ kernel_size=1,
+ norm_layer=norm_layer,
+ activation_layer=nn.Hardswish))
+ self.features = nn.Sequential(*layers)
+ self.avgpool = nn.AdaptiveAvgPool2d(1)
+ self.classifier = nn.Sequential(nn.Linear(lastconv_output_c, last_channel),
+ nn.Hardswish(inplace=True),
+ nn.Dropout(p=0.2, inplace=True),
+ nn.Linear(last_channel, num_classes))
+
+ # initial weights
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out")
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+ nn.init.ones_(m.weight)
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.Linear):
+ nn.init.normal_(m.weight, 0, 0.01)
+ nn.init.zeros_(m.bias)
+
+ def _forward_impl(self, x: Tensor) -> Tensor:
+ x = self.features(x)
+ x = self.avgpool(x)
+ x = torch.flatten(x, 1)
+ x = self.classifier(x)
+
+ return x
+
+ def forward(self, x: Tensor) -> Tensor:
+ return self._forward_impl(x)
+
+
+def mobilenet_v3_large(num_classes: int = 1000,
+ reduced_tail: bool = False) -> MobileNetV3:
+ """
+ Constructs a large MobileNetV3 architecture from
+ "Searching for MobileNetV3" .
+
+ weights_link:
+ https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth
+
+ Args:
+ num_classes (int): number of classes
+ reduced_tail (bool): If True, reduces the channel counts of all feature layers
+ between C4 and C5 by 2. It is used to reduce the channel redundancy in the
+ backbone for Detection and Segmentation.
+ """
+ width_multi = 1.0
+ bneck_conf = partial(InvertedResidualConfig, width_multi=width_multi)
+ adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_multi=width_multi)
+
+ reduce_divider = 2 if reduced_tail else 1
+
+ inverted_residual_setting = [
+ # input_c, kernel, expanded_c, out_c, use_se, activation, stride
+ bneck_conf(16, 3, 16, 16, False, "RE", 1),
+ bneck_conf(16, 3, 64, 24, False, "RE", 2), # C1
+ bneck_conf(24, 3, 72, 24, False, "RE", 1),
+ bneck_conf(24, 5, 72, 40, True, "RE", 2), # C2
+ bneck_conf(40, 5, 120, 40, True, "RE", 1),
+ bneck_conf(40, 5, 120, 40, True, "RE", 1),
+ bneck_conf(40, 3, 240, 80, False, "HS", 2), # C3
+ bneck_conf(80, 3, 200, 80, False, "HS", 1),
+ bneck_conf(80, 3, 184, 80, False, "HS", 1),
+ bneck_conf(80, 3, 184, 80, False, "HS", 1),
+ bneck_conf(80, 3, 480, 112, True, "HS", 1),
+ bneck_conf(112, 3, 672, 112, True, "HS", 1),
+ bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2), # C4
+ bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1),
+ bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1),
+ ]
+ last_channel = adjust_channels(1280 // reduce_divider) # C5
+
+ return MobileNetV3(inverted_residual_setting=inverted_residual_setting,
+ last_channel=last_channel,
+ num_classes=num_classes)
+
+
+def mobilenet_v3_small(num_classes: int = 1000,
+ reduced_tail: bool = False) -> MobileNetV3:
+ """
+    Constructs a small MobileNetV3 architecture from
+    "Searching for MobileNetV3".
+
+ weights_link:
+ https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth
+
+ Args:
+ num_classes (int): number of classes
+ reduced_tail (bool): If True, reduces the channel counts of all feature layers
+ between C4 and C5 by 2. It is used to reduce the channel redundancy in the
+ backbone for Detection and Segmentation.
+ """
+ width_multi = 1.0
+ bneck_conf = partial(InvertedResidualConfig, width_multi=width_multi)
+ adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_multi=width_multi)
+
+ reduce_divider = 2 if reduced_tail else 1
+
+ inverted_residual_setting = [
+ # input_c, kernel, expanded_c, out_c, use_se, activation, stride
+ bneck_conf(16, 3, 16, 16, True, "RE", 2), # C1
+ bneck_conf(16, 3, 72, 24, False, "RE", 2), # C2
+ bneck_conf(24, 3, 88, 24, False, "RE", 1),
+ bneck_conf(24, 5, 96, 40, True, "HS", 2), # C3
+ bneck_conf(40, 5, 240, 40, True, "HS", 1),
+ bneck_conf(40, 5, 240, 40, True, "HS", 1),
+ bneck_conf(40, 5, 120, 48, True, "HS", 1),
+ bneck_conf(48, 5, 144, 48, True, "HS", 1),
+ bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2), # C4
+ bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1),
+ bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1)
+ ]
+ last_channel = adjust_channels(1024 // reduce_divider) # C5
+
+ return MobileNetV3(inverted_residual_setting=inverted_residual_setting,
+ last_channel=last_channel,
+ num_classes=num_classes)
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/quantization_int8.py b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/quantization_int8.py
new file mode 100644
index 000000000..a6d663735
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/quantization_int8.py
@@ -0,0 +1,84 @@
+from addict import Dict
+from compression.engines.ie_engine import IEEngine
+from compression.graph import load_model, save_model
+from compression.graph.model_utils import compress_model_weights
+from compression.pipeline.initializer import create_pipeline
+from utils import MyDataLoader, Accuracy, read_split_data
+
+
+def main():
+ data_path = "/data/flower_photos"
+ ir_model_xml = "ir_output/resnet34.xml"
+ ir_model_bin = "ir_output/resnet34.bin"
+ save_dir = "quant_ir_output"
+ model_name = "quantized_resnet34"
+ img_w = 224
+ img_h = 224
+
+ model_config = Dict({
+ 'model_name': 'resnet34',
+ 'model': ir_model_xml,
+ 'weights': ir_model_bin
+ })
+ engine_config = Dict({
+ 'device': 'CPU',
+ 'stat_requests_number': 2,
+ 'eval_requests_number': 2
+ })
+ dataset_config = {
+ 'data_source': data_path
+ }
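+    # 'DefaultQuantization' with the 'performance' preset applies symmetric INT8 quantization;
+    # 'stat_subset_size' is the number of calibration samples used to collect activation statistics.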
+ algorithms = [
+ {
+ 'name': 'DefaultQuantization',
+ 'params': {
+ 'target_device': 'CPU',
+ 'preset': 'performance',
+ 'stat_subset_size': 300
+ }
+ }
+ ]
+
+ # Steps 1-7: Model optimization
+ # Step 1: Load the model.
+ model = load_model(model_config)
+
+ # Step 2: Initialize the data loader.
+ _, _, val_images_path, val_images_label = read_split_data(data_path, val_rate=0.2)
+ data_loader = MyDataLoader(dataset_config, val_images_path, val_images_label, img_w, img_h)
+
+ # Step 3 (Optional. Required for AccuracyAwareQuantization): Initialize the metric.
+ metric = Accuracy(top_k=1)
+
+ # Step 4: Initialize the engine for metric calculation and statistics collection.
+ engine = IEEngine(engine_config, data_loader, metric)
+
+ # Step 5: Create a pipeline of compression algorithms.
+ pipeline = create_pipeline(algorithms, engine)
+
+ # Step 6: Execute the pipeline.
+ compressed_model = pipeline.run(model)
+
+ # Step 7 (Optional): Compress model weights quantized precision
+ # in order to reduce the size of final .bin file.
+ compress_model_weights(compressed_model)
+
+ # Step 8: Save the compressed model to the desired path.
+ compressed_model_paths = save_model(model=compressed_model,
+ save_path=save_dir,
+ model_name=model_name)
+
+ # Step 9: Compare accuracy of the original and quantized models.
+ metric_results = pipeline.evaluate(model)
+ if metric_results:
+ for name, value in metric_results.items():
+ print(f"Accuracy of the original model: {name}: {value}")
+
+ metric_results = pipeline.evaluate(compressed_model)
+ if metric_results:
+ for name, value in metric_results.items():
+ print(f"Accuracy of the optimized model: {name}: {value}")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/requirements.txt b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/requirements.txt
new file mode 100644
index 000000000..662c48d20
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/requirements.txt
@@ -0,0 +1,7 @@
+torch==1.11.0
+torchvision==0.12.0
+onnx==1.13.0
+onnxruntime==1.8.0
+protobuf==3.19.5
+openvino-dev==2022.1.0
+matplotlib
\ No newline at end of file
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/utils.py b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/utils.py
new file mode 100644
index 000000000..62d0ae03c
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_resnet34/utils.py
@@ -0,0 +1,137 @@
+import os
+import json
+import random
+
+from PIL import Image
+import numpy as np
+from compression.api import DataLoader, Metric
+from torchvision.transforms import transforms
+
+
+def read_split_data(root: str, val_rate: float = 0.2):
+    random.seed(0)  # make the random split reproducible
+    assert os.path.exists(root), "dataset root: {} does not exist.".format(root)
+
+    # traverse the folders; each folder corresponds to one class
+    flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
+    # sort to keep the class order consistent
+    flower_class.sort()
+    # map each class name to a numeric index
+ class_indices = dict((k, v) for v, k in enumerate(flower_class))
+ json_str = json.dumps(dict((val, key) for key, val in class_indices.items()), indent=4)
+ with open('class_indices.json', 'w') as json_file:
+ json_file.write(json_str)
+
+    train_images_path = []  # paths of all training images
+    train_images_label = []  # class indices of the training images
+    val_images_path = []  # paths of all validation images
+    val_images_label = []  # class indices of the validation images
+    every_class_num = []  # number of samples per class
+    supported = [".jpg", ".JPG", ".png", ".PNG"]  # supported file extensions
+    # iterate over the files in each class folder
+ for cla in flower_class:
+ cla_path = os.path.join(root, cla)
+        # collect the paths of all files with a supported extension
+        images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
+                  if os.path.splitext(i)[-1] in supported]
+        # numeric index of this class
+        image_class = class_indices[cla]
+        # record the number of samples for this class
+        every_class_num.append(len(images))
+        # randomly sample validation images at the given ratio
+        val_path = random.sample(images, k=int(len(images) * val_rate))
+
+        for img_path in images:
+            if img_path in val_path:  # sampled for validation, add to the validation set
+                val_images_path.append(img_path)
+                val_images_label.append(image_class)
+            else:  # otherwise add to the training set
+ train_images_path.append(img_path)
+ train_images_label.append(image_class)
+
+ print("{} images were found in the dataset.".format(sum(every_class_num)))
+ print("{} images for training.".format(len(train_images_path)))
+ print("{} images for validation.".format(len(val_images_path)))
+
+ return train_images_path, train_images_label, val_images_path, val_images_label
+
+
+# Custom implementation of classification accuracy metric.
+class Accuracy(Metric):
+ # Required methods
+ def __init__(self, top_k=1):
+ super().__init__()
+ self._top_k = top_k
+ self._name = 'accuracy@top{}'.format(self._top_k)
+ self._matches = []
+
+ @property
+ def value(self):
+ """ Returns accuracy metric value for the last model output. """
+ return {self._name: self._matches[-1]}
+
+ @property
+ def avg_value(self):
+ """ Returns accuracy metric value for all model outputs. """
+ return {self._name: np.ravel(self._matches).mean()}
+
+ def update(self, output, target):
+ """ Updates prediction matches.
+ :param output: model output
+ :param target: annotations
+ """
+ if len(output) > 1:
+ raise Exception('The accuracy metric cannot be calculated '
+ 'for a model with multiple outputs')
+ if isinstance(target, dict):
+ target = list(target.values())
+ predictions = np.argsort(output[0], axis=1)[:, -self._top_k:]
+ match = [float(t in predictions[i]) for i, t in enumerate(target)]
+
+ self._matches.append(match)
+
+ def reset(self):
+ """ Resets collected matches """
+ self._matches = []
+
+ def get_attributes(self):
+ """
+ Returns a dictionary of metric attributes {metric_name: {attribute_name: value}}.
+ Required attributes: 'direction': 'higher-better' or 'higher-worse'
+ 'type': metric type
+ """
+ return {self._name: {'direction': 'higher-better',
+ 'type': 'accuracy'}}
+
+
+class MyDataLoader(DataLoader):
+ def __init__(self, cfg, images_path: list, images_label: list, img_w: int = 224, img_h: int = 224):
+ super().__init__(cfg)
+ self.images_path = images_path
+ self.images_label = images_label
+ self.image_w = img_w
+ self.image_h = img_h
+ self.transforms = transforms.Compose([
+ transforms.Resize(min(img_h, img_w)),
+ transforms.CenterCrop((img_h, img_w))
+ ])
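+        # note: no ToTensor/Normalize here; the mean/scale normalization was embedded
+        # into the IR by mo (--mean_values/--scale_values), so raw resized pixels are fed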
+
+ def __len__(self):
+ return len(self.images_label)
+
+ def __getitem__(self, index):
+ """
+ Return one sample of index, label and picture.
+ :param index: index of the taken sample.
+ """
+ if index >= len(self):
+ raise IndexError
+
+ img = Image.open(self.images_path[index])
+ img = self.transforms(img)
+
+ # Convert the resized images to network input shape
+ # [h, w, c] -> [c, h, w] -> [1, c, h, w]
+ img = np.expand_dims(np.transpose(np.array(img), (2, 0, 1)), 0)
+
+ return (index, self.images_label[index]), img
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/README.md b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/README.md
new file mode 100644
index 000000000..682bb111f
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/README.md
@@ -0,0 +1,61 @@
+Quantizing YOLOv5 with OpenVINO
+
+1. Set up the environment according to `requirements.txt`
+2. Convert YOLOv5 to ONNX
+The official YOLOv5 repository can export to both ONNX and OpenVINO, but here only the ONNX export is used, with YOLOv5s as an example:
+```
+python export.py --weights yolov5s.pt --include onnx
+```
+
+3. Convert ONNX to IR
+Use OpenVINO's `mo` tool to convert the ONNX model to OpenVINO IR format:
+```
+mo --input_model yolov5s.onnx \
+ --input_shape "[1,3,640,640]" \
+ --scale 255 \
+ --data_type FP32 \
+ --output_dir ir_output
+```
+
+4. Quantize the model
+Use `quantization_int8.py` to quantize the model. Quantization requires the COCO2017 dataset; point `data_path` to the coco2017 directory, laid out as below. An example invocation is shown after the directory layout.
+```
+├── coco2017: dataset root directory
+    ├── train2017: all training images (118287 images)
+    ├── val2017: all validation images (5000 images)
+    └── annotations: annotation files
+        ├── instances_train2017.json: training-set annotations for detection/segmentation
+        ├── instances_val2017.json: validation-set annotations for detection/segmentation
+        ├── captions_train2017.json: training-set annotations for image captioning
+        ├── captions_val2017.json: validation-set annotations for image captioning
+        ├── person_keypoints_train2017.json: training-set annotations for keypoint detection
+        └── person_keypoints_val2017.json: validation-set annotations for keypoint detection
+```
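+
+For example, with `data_path` in the scripts pointing at the directory above (a minimal sketch):
+```
+python quantization_int8.py
+python evaluation.py    # optional: COCO mAP check of quant_ir_output/quantized_yolov5s.xml
+```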
+
+5. Benchmark
+Use the `benchmark_app` tool to measure the `Throughput` before and after quantization, here on a `CPU: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz` as an example:
+```
+benchmark_app -m ir_output/yolov5s.xml -d CPU -api sync
+```
+output:
+```
+Latency:
+ Median: 59.56 ms
+ AVG: 63.30 ms
+ MIN: 57.88 ms
+ MAX: 99.89 ms
+Throughput: 16.79 FPS
+```
+
+```
+benchmark_app -m quant_ir_output/quantized_yolov5s.xml -d CPU -api sync
+```
+output:
+```
+Latency:
+ Median: 42.97 ms
+ AVG: 46.56 ms
+ MIN: 41.18 ms
+ MAX: 95.75 ms
+Throughput: 23.27 FPS
+```
\ No newline at end of file
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/compare_fps.py b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/compare_fps.py
new file mode 100644
index 000000000..0a4abfd84
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/compare_fps.py
@@ -0,0 +1,121 @@
+import time
+import numpy as np
+import torch
+import onnxruntime
+import matplotlib.pyplot as plt
+from openvino.runtime import Core
+
+
+def normalize(image: np.ndarray) -> np.ndarray:
+ """
+    Scale pixel values to the [0, 1] range
+ """
+ image = image.astype(np.float32)
+ image /= 255.0
+ return image
+
+
+def onnx_inference(onnx_path: str, image: np.ndarray, num_images: int = 20):
+ # load onnx model
+ ort_session = onnxruntime.InferenceSession(onnx_path)
+
+ # compute onnx Runtime output prediction
+ ort_inputs = {ort_session.get_inputs()[0].name: image}
+
+ start = time.perf_counter()
+ for _ in range(num_images):
+ ort_session.run(None, ort_inputs)
+ end = time.perf_counter()
+ time_onnx = end - start
+ print(
+ f"ONNX model in Inference Engine/CPU: {time_onnx / num_images:.3f} "
+ f"seconds per image, FPS: {num_images / time_onnx:.2f}"
+ )
+
+ return num_images / time_onnx
+
+
+def ir_inference(ir_path: str, image: np.ndarray, num_images: int = 20):
+ # Load the network in Inference Engine
+ ie = Core()
+ model_ir = ie.read_model(model=ir_path)
+ compiled_model_ir = ie.compile_model(model=model_ir, device_name="CPU")
+
+ # Get input and output layers
+ input_layer_ir = next(iter(compiled_model_ir.inputs))
+ output_layer_ir = next(iter(compiled_model_ir.outputs))
+
+ start = time.perf_counter()
+ request_ir = compiled_model_ir.create_infer_request()
+ for _ in range(num_images):
+ request_ir.infer(inputs={input_layer_ir.any_name: image})
+ end = time.perf_counter()
+ time_ir = end - start
+ print(
+ f"IR model in Inference Engine/CPU: {time_ir / num_images:.3f} "
+ f"seconds per image, FPS: {num_images / time_ir:.2f}"
+ )
+
+ return num_images / time_ir
+
+
+def pytorch_inference(image: np.ndarray, num_images: int = 20):
+ image = torch.as_tensor(image, dtype=torch.float32)
+
+ model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
+ model.eval()
+
+ with torch.no_grad():
+ start = time.perf_counter()
+ for _ in range(num_images):
+ model(image)
+ end = time.perf_counter()
+ time_torch = end - start
+
+ print(
+ f"PyTorch model on CPU: {time_torch / num_images:.3f} seconds per image, "
+ f"FPS: {num_images / time_torch:.2f}"
+ )
+
+ return num_images / time_torch
+
+
+def plot_fps(v: dict):
+ x = list(v.keys())
+ y = list(v.values())
+
+ plt.bar(range(len(x)), y, align='center')
+ plt.xticks(range(len(x)), x)
+ for i, v in enumerate(y):
+ plt.text(x=i, y=v+0.5, s=f"{v:.2f}", ha='center')
+ plt.xlabel('model format')
+ plt.ylabel('fps')
+ plt.title('FPS comparison')
+    plt.savefig('fps_vs.jpg')
+    plt.show()
+
+
+def main():
+ image_h = 640
+ image_w = 640
+ onnx_path = "yolov5s.onnx"
+ ir_path = "ir_output/yolov5s.xml"
+
+ image = np.random.randn(image_h, image_w, 3)
+ normalized_image = normalize(image)
+
+ # Convert the resized images to network input shape
+ # [h, w, c] -> [c, h, w] -> [1, c, h, w]
+ input_image = np.expand_dims(np.transpose(image, (2, 0, 1)), 0)
+ normalized_input_image = np.expand_dims(np.transpose(normalized_image, (2, 0, 1)), 0)
+
+ onnx_fps = onnx_inference(onnx_path, normalized_input_image, num_images=100)
+ ir_fps = ir_inference(ir_path, input_image, num_images=100)
+ pytorch_fps = pytorch_inference(normalized_input_image, num_images=100)
+ plot_fps({"pytorch": round(pytorch_fps, 2),
+ "onnx": round(onnx_fps, 2),
+ "ir": round(ir_fps, 2)})
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/compare_onnx_and_ir.py b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/compare_onnx_and_ir.py
new file mode 100644
index 000000000..110f22e3c
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/compare_onnx_and_ir.py
@@ -0,0 +1,61 @@
+import numpy as np
+import onnxruntime
+from openvino.runtime import Core
+
+
+def normalize(image: np.ndarray) -> np.ndarray:
+ """
+    Scale pixel values to the [0, 1] range
+ """
+ image = image.astype(np.float32)
+ image /= 255.0
+ return image
+
+
+def onnx_inference(onnx_path: str, image: np.ndarray):
+ # load onnx model
+ ort_session = onnxruntime.InferenceSession(onnx_path)
+
+ # compute onnx Runtime output prediction
+ ort_inputs = {ort_session.get_inputs()[0].name: image}
+ res_onnx = ort_session.run(None, ort_inputs)[0]
+ return res_onnx
+
+
+def ir_inference(ir_path: str, image: np.ndarray):
+ # Load the network in Inference Engine
+ ie = Core()
+ model_ir = ie.read_model(model=ir_path)
+ compiled_model_ir = ie.compile_model(model=model_ir, device_name="CPU")
+
+ # Get input and output layers
+ input_layer_ir = next(iter(compiled_model_ir.inputs))
+ output_layer_ir = next(iter(compiled_model_ir.outputs))
+
+ # Run inference on the input image
+ res_ir = compiled_model_ir([image])[output_layer_ir]
+ return res_ir
+
+
+def main():
+ image_h = 640
+ image_w = 640
+ onnx_path = "yolov5s.onnx"
+ ir_path = "ir_output/yolov5s.xml"
+
+ image = np.random.randn(image_h, image_w, 3)
+ normalized_image = normalize(image)
+
+ # Convert the resized images to network input shape
+ # [h, w, c] -> [c, h, w] -> [1, c, h, w]
+ input_image = np.expand_dims(np.transpose(image, (2, 0, 1)), 0)
+ normalized_input_image = np.expand_dims(np.transpose(normalized_image, (2, 0, 1)), 0)
+
+ onnx_res = onnx_inference(onnx_path, normalized_input_image)
+ ir_res = ir_inference(ir_path, input_image)
+ np.testing.assert_allclose(onnx_res, ir_res, rtol=1e-03, atol=1e-05)
+ print("Exported model has been tested with OpenvinoRuntime, and the result looks good!")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/draw_box_utils.py b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/draw_box_utils.py
new file mode 100644
index 000000000..835d7f7c1
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/draw_box_utils.py
@@ -0,0 +1,153 @@
+from PIL.Image import Image, fromarray
+import PIL.ImageDraw as ImageDraw
+import PIL.ImageFont as ImageFont
+from PIL import ImageColor
+import numpy as np
+
+STANDARD_COLORS = [
+ 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
+ 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
+ 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
+ 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
+ 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
+ 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
+ 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
+ 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
+ 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
+ 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
+ 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
+ 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
+ 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
+ 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
+ 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
+ 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
+ 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
+ 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
+ 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
+ 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
+ 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
+ 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
+ 'WhiteSmoke', 'Yellow', 'YellowGreen'
+]
+
+
+def draw_text(draw,
+ box: list,
+ cls: int,
+ score: float,
+ category_index: dict,
+ color: str,
+ font: str = 'arial.ttf',
+ font_size: int = 24):
+ """
+    Draw the target's bounding box and class information onto the image.
+ """
+ try:
+ font = ImageFont.truetype(font, font_size)
+ except IOError:
+ font = ImageFont.load_default()
+
+ left, top, right, bottom = box
+ # If the total height of the display strings added to the top of the bounding
+ # box exceeds the top of the image, stack the strings below the bounding box
+ # instead of above.
+ display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
+ display_str_heights = [font.getsize(ds)[1] for ds in display_str]
+ # Each display_str has a top and bottom margin of 0.05x.
+ display_str_height = (1 + 2 * 0.05) * max(display_str_heights)
+
+ if top > display_str_height:
+ text_top = top - display_str_height
+ text_bottom = top
+ else:
+ text_top = bottom
+ text_bottom = bottom + display_str_height
+
+ for ds in display_str:
+ text_width, text_height = font.getsize(ds)
+ margin = np.ceil(0.05 * text_width)
+ draw.rectangle([(left, text_top),
+ (left + text_width + 2 * margin, text_bottom)], fill=color)
+ draw.text((left + margin, text_top),
+ ds,
+ fill='black',
+ font=font)
+ left += text_width
+
+
+def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
+ np_image = np.array(image)
+ masks = np.where(masks > thresh, True, False)
+
+ # colors = np.array(colors)
+ img_to_draw = np.copy(np_image)
+ # TODO: There might be a way to vectorize this
+ for mask, color in zip(masks, colors):
+ img_to_draw[mask] = color
+
+ out = np_image * (1 - alpha) + img_to_draw * alpha
+ return fromarray(out.astype(np.uint8))
+
+
+def draw_objs(image: Image,
+ boxes: np.ndarray = None,
+ classes: np.ndarray = None,
+ scores: np.ndarray = None,
+ masks: np.ndarray = None,
+ category_index: dict = None,
+ box_thresh: float = 0.1,
+ mask_thresh: float = 0.5,
+ line_thickness: int = 8,
+ font: str = 'arial.ttf',
+ font_size: int = 24,
+ draw_boxes_on_image: bool = True,
+ draw_masks_on_image: bool = False):
+ """
+    Draw bounding boxes, class labels and masks onto the image.
+    Args:
+        image: image to draw on
+        boxes: bounding box coordinates
+        classes: class indices
+        scores: confidence scores
+        masks: segmentation masks
+        category_index: dict mapping class index to class name
+        box_thresh: score threshold used to filter detections
+        mask_thresh: threshold used to binarize masks
+        line_thickness: bounding box line width
+        font: font type
+        font_size: font size
+        draw_boxes_on_image: whether to draw boxes
+        draw_masks_on_image: whether to draw masks
+
+ Returns:
+
+ """
+
+    # filter out low-confidence detections
+ idxs = np.greater(scores, box_thresh)
+ boxes = boxes[idxs]
+ classes = classes[idxs]
+ scores = scores[idxs]
+ if masks is not None:
+ masks = masks[idxs]
+ if len(boxes) == 0:
+ return image
+
+ colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]
+
+ if draw_boxes_on_image:
+ # Draw all boxes onto image.
+ draw = ImageDraw.Draw(image)
+ for box, cls, score, color in zip(boxes, classes, scores, colors):
+ left, top, right, bottom = box
+            # draw the bounding box
+ draw.line([(left, top), (left, bottom), (right, bottom),
+ (right, top), (left, top)], width=line_thickness, fill=color)
+            # draw the class label and score
+ draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)
+
+ if draw_masks_on_image and (masks is not None):
+ # Draw all mask onto image.
+ image = draw_masks(image, masks, colors, mask_thresh)
+
+ return image
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/evaluation.py b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/evaluation.py
new file mode 100644
index 000000000..96f1ada13
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/evaluation.py
@@ -0,0 +1,44 @@
+from tqdm import tqdm
+import torch
+from openvino.runtime import Core
+from utils import MyDataLoader, EvalCOCOMetric, non_max_suppression
+
+
+def main():
+ data_path = "/data/coco2017"
+ ir_model_xml = "quant_ir_output/quantized_yolov5s.xml"
+ img_size = (640, 640) # h, w
+
+ data_loader = MyDataLoader(data_path, "val", size=img_size)
+ coco80_to_91 = data_loader.coco_id80_to_id91
+ metrics = EvalCOCOMetric(coco=data_loader.coco, classes_mapping=coco80_to_91)
+
+ # Load the network in Inference Engine
+ ie = Core()
+ model_ir = ie.read_model(model=ir_model_xml)
+ compiled_model = ie.compile_model(model=model_ir, device_name="CPU")
+ inputs_names = compiled_model.inputs
+ outputs_names = compiled_model.outputs
+
+ # inference
+ request = compiled_model.create_infer_request()
+ for i in tqdm(range(len(data_loader))):
+ data = data_loader[i]
+ ann, img, info = data
+ ann = ann + (info,)
+
+ request.infer(inputs={inputs_names[0]: img})
+ result = request.get_output_tensor(outputs_names[0].index).data
+
+ # post-process
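+        # conf_thres=0.001 and iou_thres=0.6 match the defaults YOLOv5's val.py uses for mAP evaluation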
+ result = non_max_suppression(torch.Tensor(result), conf_thres=0.001, iou_thres=0.6, multi_label=True)[0]
+ boxes = result[:, :4].numpy()
+ scores = result[:, 4].numpy()
+ cls = result[:, 5].numpy().astype(int)
+ metrics.update(ann, [boxes, cls, scores])
+
+ metrics.evaluate()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/predict.py b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/predict.py
new file mode 100644
index 000000000..6f01b5709
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/predict.py
@@ -0,0 +1,50 @@
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+import matplotlib.pyplot as plt
+from openvino.runtime import Core
+from utils import letterbox, scale_coords, non_max_suppression, coco80_names
+from draw_box_utils import draw_objs
+
+
+def main():
+ img_path = "test.jpg"
+ ir_model_xml = "ir_output/yolov5s.xml"
+ img_size = (640, 640) # h, w
+
+ origin_img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
+ reshape_img, ratio, pad = letterbox(origin_img, img_size, auto=False)
+ input_img = np.expand_dims(np.transpose(reshape_img, [2, 0, 1]), 0).astype(np.float32)
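+    # pixel values stay in the 0-255 range: the IR applies the 1/255 scaling itself,
+    # since the model was converted with mo --scale 255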
+
+ # Load the network in Inference Engine
+ ie = Core()
+ model_ir = ie.read_model(model=ir_model_xml)
+ compiled_model = ie.compile_model(model=model_ir, device_name="CPU")
+ inputs_names = compiled_model.inputs
+ outputs_names = compiled_model.outputs
+
+ # inference
+ request = compiled_model.create_infer_request()
+ request.infer(inputs={inputs_names[0]: input_img})
+ result = request.get_output_tensor(outputs_names[0].index).data
+
+ # post-process
+ result = non_max_suppression(torch.Tensor(result))[0]
+ boxes = result[:, :4].numpy()
+ scores = result[:, 4].numpy()
+ cls = result[:, 5].numpy().astype(int)
+ boxes = scale_coords(reshape_img.shape, boxes, origin_img.shape, (ratio, pad))
+
+ draw_img = draw_objs(Image.fromarray(origin_img),
+ boxes,
+ cls,
+ scores,
+ category_index=dict([(str(i), v) for i, v in enumerate(coco80_names)]))
+ plt.imshow(draw_img)
+ plt.show()
+ draw_img.save("predict.jpg")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/quantization_int8.py b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/quantization_int8.py
new file mode 100644
index 000000000..d0decff0b
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/quantization_int8.py
@@ -0,0 +1,102 @@
+import time
+from addict import Dict
+from compression.engines.ie_engine import IEEngine
+from compression.graph import load_model, save_model
+from compression.graph.model_utils import compress_model_weights
+from compression.pipeline.initializer import create_pipeline
+from yaspin import yaspin
+from utils import MyDataLoader, MAPMetric
+
+
+def main():
+ data_path = "/data/coco2017"
+ ir_model_xml = "ir_output/yolov5s.xml"
+ ir_model_bin = "ir_output/yolov5s.bin"
+ save_dir = "quant_ir_output"
+ model_name = "quantized_yolov5s"
+ img_w = 640
+ img_h = 640
+
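+    # 'inputs'/'outputs' below are the tensor names of the exported YOLOv5s ONNX graph
+    # ('images' and 'output', as produced by the YOLOv5 export script)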
+ model_config = Dict({
+ 'model_name': 'yolov5s',
+ 'model': ir_model_xml,
+ 'weights': ir_model_bin,
+ 'inputs': 'images',
+ 'outputs': 'output'
+ })
+ engine_config = Dict({'device': 'CPU'})
+
+ algorithms = [
+ {
+ 'name': 'DefaultQuantization',
+ 'params': {
+ 'target_device': 'CPU',
+ 'preset': 'performance',
+ 'stat_subset_size': 300
+ }
+ }
+ ]
+
+ # Step 1: Load the model.
+ model = load_model(model_config)
+
+ # Step 2: Initialize the data loader.
+ data_loader = MyDataLoader(data_path, "val", (img_h, img_w))
+
+ # Step 3: initialize the metric
+ # For DefaultQuantization, specifying a metric is optional: metric can be set to None
+ metric = MAPMetric(map_value="map")
+
+ # Step 4: Initialize the engine for metric calculation and statistics collection.
+ engine = IEEngine(config=engine_config, data_loader=data_loader, metric=metric)
+
+ # Step 5: Create a pipeline of compression algorithms.
+ pipeline = create_pipeline(algorithms, engine)
+
+ # Step 6: Execute the pipeline to quantize the model
+ algorithm_name = pipeline.algo_seq[0].name
+ with yaspin(
+ text=f"Executing POT pipeline on {model_config['model']} with {algorithm_name}"
+ ) as sp:
+ start_time = time.perf_counter()
+ compressed_model = pipeline.run(model)
+ end_time = time.perf_counter()
+ sp.ok("✔")
+ print(f"Quantization finished in {end_time - start_time:.2f} seconds")
+
+ # Step 7 (Optional): Compress model weights to quantized precision
+ # in order to reduce the size of the final .bin file
+ compress_model_weights(compressed_model)
+
+ # Step 8: Save the compressed model to the desired path.
+ # Set save_path to the directory where the compressed model should be stored
+ compressed_model_paths = save_model(
+ model=compressed_model,
+ save_path=save_dir,
+ model_name=model_name,
+ )
+
+ compressed_model_path = compressed_model_paths[0]["model"]
+ print("The quantized model is stored at", compressed_model_path)
+
+ # Compute the mAP on the quantized model and compare with the mAP on the FP16 IR model.
+ ir_model = load_model(model_config=model_config)
+ evaluation_pipeline = create_pipeline(algo_config=dict(), engine=engine)
+
+ with yaspin(text="Evaluating original IR model") as sp:
+ original_metric = evaluation_pipeline.evaluate(ir_model)
+
+ if original_metric:
+ for key, value in original_metric.items():
+ print(f"The {key} score of the original model is {value:.5f}")
+
+ with yaspin(text="Evaluating quantized IR model") as sp:
+ quantized_metric = pipeline.evaluate(compressed_model)
+
+ if quantized_metric:
+ for key, value in quantized_metric.items():
+ print(f"The {key} score of the quantized INT8 model is {value:.5f}")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/requirements.txt b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/requirements.txt
new file mode 100644
index 000000000..30b17622c
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/requirements.txt
@@ -0,0 +1,8 @@
+torch==1.13.1
+torchvision==0.14.1
+onnx==1.13.0
+onnxruntime==1.8.0
+protobuf==3.19.5
+openvino-dev==2022.1.0
+matplotlib
+torchmetrics==0.9.1
\ No newline at end of file
diff --git a/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/utils.py b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/utils.py
new file mode 100644
index 000000000..e3bcf6d4a
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_openvino/convert_yolov5/utils.py
@@ -0,0 +1,552 @@
+import os
+import time
+import json
+import copy
+
+import cv2
+import numpy as np
+import torch
+from torchmetrics.detection.mean_ap import MeanAveragePrecision
+import torchvision
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from compression.api import DataLoader, Metric
+
+
+coco80_names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
+ 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
+ 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
+ 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
+ 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
+ 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+ 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
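+# note: the list above is the 80-class YOLOv5 label set; MyDataLoader below maps these
+# indices to the 91-id category scheme used in the COCO annotation files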
+
+
+def box_iou(box1, box2):
+ # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
+ """
+ Return intersection-over-union (Jaccard index) of boxes.
+ Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+ Arguments:
+ box1 (Tensor[N, 4])
+ box2 (Tensor[M, 4])
+ Returns:
+ iou (Tensor[N, M]): the NxM matrix containing the pairwise
+ IoU values for every element in boxes1 and boxes2
+ """
+
+ def box_area(box):
+ # box = 4xn
+ return (box[2] - box[0]) * (box[3] - box[1])
+
+ area1 = box_area(box1.T)
+ area2 = box_area(box2.T)
+
+ # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
+ inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
+ return inter / (area1[:, None] + area2 - inter) # iou = inter / (area1 + area2 - inter)
+
+
+def xywh2xyxy(x):
+ # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+ y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
+ y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
+ y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
+ y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
+ return y
+
+
+def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
+ labels=(), max_det=300):
+ """Runs Non-Maximum Suppression (NMS) on inference results
+
+ Returns:
+ list of detections, on (n,6) tensor per image [xyxy, conf, cls]
+ """
+
+ nc = prediction.shape[2] - 5 # number of classes
+ xc = prediction[..., 4] > conf_thres # candidates
+
+ # Checks
+ assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
+ assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
+
+ # Settings
+ min_wh, max_wh = 2, 7680 # (pixels) minimum and maximum box width and height
+ max_nms = 30000 # maximum number of boxes into torchvision.ops.nms()
+ time_limit = 10.0 # seconds to quit after
+ redundant = True # require redundant detections
+ multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
+ merge = False # use merge-NMS
+
+ t = time.time()
+ output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
+ for xi, x in enumerate(prediction): # image index, image inference
+ # Apply constraints
+ x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
+ x = x[xc[xi]] # confidence
+
+ # Cat apriori labels if autolabelling
+ if labels and len(labels[xi]):
+ lb = labels[xi]
+ v = torch.zeros((len(lb), nc + 5), device=x.device)
+ v[:, :4] = lb[:, 1:5] # box
+ v[:, 4] = 1.0 # conf
+ v[range(len(lb)), lb[:, 0].long() + 5] = 1.0 # cls
+ x = torch.cat((x, v), 0)
+
+ # If none remain process next image
+ if not x.shape[0]:
+ continue
+
+ # Compute conf
+ x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
+
+ # Box (center x, center y, width, height) to (x1, y1, x2, y2)
+ box = xywh2xyxy(x[:, :4])
+
+ # Detections matrix nx6 (xyxy, conf, cls)
+ if multi_label:
+ i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
+ x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
+ else: # best class only
+ conf, j = x[:, 5:].max(1, keepdim=True)
+ x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
+
+ # Filter by class
+ if classes is not None:
+ x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+
+ # Apply finite constraint
+ # if not torch.isfinite(x).all():
+ # x = x[torch.isfinite(x).all(1)]
+
+ # Check shape
+ n = x.shape[0] # number of boxes
+ if not n: # no boxes
+ continue
+ elif n > max_nms: # excess boxes
+ x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence
+
+ # Batched NMS
+ c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
+ boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
+ i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
+ if i.shape[0] > max_det: # limit detections
+ i = i[:max_det]
+ if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
+ # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
+ iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
+ weights = iou * scores[None] # box weights
+ x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
+ if redundant:
+ i = i[iou.sum(1) > 1] # require redundancy
+
+ output[xi] = x[i]
+ if (time.time() - t) > time_limit:
+ print(f'WARNING: NMS time limit {time_limit}s exceeded')
+ break # time limit exceeded
+
+ return output
+
+
+class MAPMetric(Metric):
+ def __init__(self, map_value="map", conf_thres=0.001, iou_thres=0.6):
+ """
+ Mean Average Precision Metric. Wraps torchmetrics implementation, see
+ https://torchmetrics.readthedocs.io/en/stable/detection/mean_average_precision.html
+
+ :map_value: specific metric to return. Default: "map"
+ Change `to one of the values in the list below to return a different value
+ ['mar_1', 'mar_10', 'mar_100', 'mar_small', 'mar_medium', 'mar_large',
+ 'map', 'map_50', 'map_75', 'map_small', 'map_medium', 'map_large']
+ See torchmetrics documentation for more details.
+ """
+
+ self._name = map_value
+ self.metric = MeanAveragePrecision(box_format="xyxy")
+ self.conf_thres = conf_thres
+ self.iou_thres = iou_thres
+ super().__init__()
+
+ @property
+ def value(self):
+ """
+ Returns metric value for the last model output.
+ Possible format: {metric_name: [metric_values_per_image]}
+ """
+ return {self._name: [0]}
+
+ @property
+ def avg_value(self):
+ """
+ Returns average metric value for all model outputs.
+ Possible format: {metric_name: metric_value}
+ """
+ return {self._name: self.metric.compute()[self._name].item()}
+
+ def update(self, output, target):
+ """
+ Convert network output and labels to the format that torchmetrics' MAP
+ implementation expects, and call `metric.update()`.
+
+ :param output: model output
+ :param target: annotations for model output
+ """
+ targetboxes = []
+ targetlabels = []
+ predboxes = []
+ predlabels = []
+ scores = []
+
+ for single_target in target[0]:
+ txmin, tymin, txmax, tymax = single_target["bbox"]
+ category = single_target["category_id"]
+
+ targetbox = [round(txmin), round(tymin), round(txmax), round(tymax)]
+ targetboxes.append(targetbox)
+ targetlabels.append(category)
+
+ output = torch.Tensor(output[0]).float()
+ output = non_max_suppression(output, conf_thres=self.conf_thres, iou_thres=self.iou_thres, multi_label=True)
+ for single_output in output:
+ for pred in single_output.numpy():
+ xmin, ymin, xmax, ymax, conf, label = pred
+
+ predbox = [round(xmin), round(ymin), round(xmax), round(ymax)]
+ predboxes.append(predbox)
+ predlabels.append(label)
+ scores.append(conf)
+
+ preds = [
+ dict(
+ boxes=torch.Tensor(predboxes).float(),
+ labels=torch.Tensor(predlabels).short(),
+ scores=torch.Tensor(scores),
+ )
+ ]
+ targets = [
+ dict(
+ boxes=torch.Tensor(targetboxes).float(),
+ labels=torch.Tensor(targetlabels).short(),
+ )
+ ]
+ self.metric.update(preds, targets)
+
+ def reset(self):
+ """
+ Resets metric
+ """
+ self.metric.reset()
+
+ def get_attributes(self):
+ """
+ Returns a dictionary of metric attributes {metric_name: {attribute_name: value}}.
+ Required attributes: 'direction': 'higher-better' or 'higher-worse'
+ 'type': metric type
+ """
+ return {self._name: {"direction": "higher-better", "type": "mAP"}}
+
+
+def _coco_remove_images_without_annotations(dataset, ids):
+ """
+    Remove images from the COCO dataset that contain no objects or only extremely small objects.
+ refer to:
+ https://github.com/pytorch/vision/blob/master/references/detection/coco_utils.py
+ :param dataset:
+ :param cat_list:
+ :return:
+ """
+ def _has_only_empty_bbox(anno):
+ return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
+
+ def _has_valid_annotation(anno):
+ # if it's empty, there is no annotation
+ if len(anno) == 0:
+ return False
+ # if all boxes have close to zero area, there is no annotation
+ if _has_only_empty_bbox(anno):
+ return False
+
+ return True
+
+ valid_ids = []
+ for ds_idx, img_id in enumerate(ids):
+ ann_ids = dataset.getAnnIds(imgIds=img_id, iscrowd=None)
+ anno = dataset.loadAnns(ann_ids)
+
+ if _has_valid_annotation(anno):
+ valid_ids.append(img_id)
+
+ return valid_ids
+
+
+def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
+ # Rescale coords (xyxy) from img1_shape to img0_shape
+ if ratio_pad is None: # calculate from img0_shape
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
+ pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
+ else:
+ assert ratio_pad[0][0] == ratio_pad[0][1]
+ gain = ratio_pad[0][0]
+ pad = ratio_pad[1]
+
+ coords[:, [0, 2]] -= pad[0] # x padding
+ coords[:, [1, 3]] -= pad[1] # y padding
+ coords[:, :4] /= gain
+ clip_coords(coords, img0_shape)
+ return coords
+
+
+def clip_coords(boxes, shape):
+ # Clip bounding xyxy bounding boxes to image shape (height, width)
+ if isinstance(boxes, torch.Tensor): # faster individually
+ boxes[:, 0].clamp_(0, shape[1]) # x1
+ boxes[:, 1].clamp_(0, shape[0]) # y1
+ boxes[:, 2].clamp_(0, shape[1]) # x2
+ boxes[:, 3].clamp_(0, shape[0]) # y2
+ else: # np.array (faster grouped)
+ boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
+ boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
+
+
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+ # Resize and pad image while meeting stride-multiple constraints
+ shape = im.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+
+ if shape[::-1] != new_unpad: # resize
+ im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+ im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ return im, ratio, (left, top)
+
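+# Worked example (illustrative, added comment): letterboxing a 480x640 (HxW) image to
+# new_shape=(640, 640) with auto=False returns ratio=(1.0, 1.0) and pad=(left, top)=(0, 80),
+# i.e. an 80-pixel gray border above and below. scale_coords() above undoes exactly this
+# transform by subtracting the padding and dividing by the gain, mapping predicted boxes
+# back to the original image.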
+
+class MyDataLoader(DataLoader):
+ """MS COCO Detection dataset wrapper.
+
+ Args:
+ root (string): Root directory where images are downloaded to.
+ dataset (string): "train" or "val".
+ size (tuple): target (h, w) after letterboxing.
+ """
+ def __init__(self, root, dataset="train", size=(640, 640)):
+ assert dataset in ["train", "val"], 'dataset must be in ["train", "val"]'
+ anno_file = "instances_{}2017.json".format(dataset)
+ assert os.path.exists(root), "file '{}' does not exist.".format(root)
+ self.img_root = os.path.join(root, "{}2017".format(dataset))
+ assert os.path.exists(self.img_root), "path '{}' does not exist.".format(self.img_root)
+ self.anno_path = os.path.join(root, "annotations", anno_file)
+ assert os.path.exists(self.anno_path), "file '{}' does not exist.".format(self.anno_path)
+
+ self.mode = dataset
+ self.size = size
+ self.coco = COCO(self.anno_path)
+
+ self.coco91_id2classes = dict([(v["id"], v["name"]) for k, v in self.coco.cats.items()])
+ coco90_classes2id = dict([(v["name"], v["id"]) for k, v in self.coco.cats.items()])
+
+ self.coco80_classes = coco80_names
+ self.coco_id80_to_id91 = dict([(i, coco90_classes2id[k]) for i, k in enumerate(coco80_names)])
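+ # Note (added comment): COCO annotations use sparse category ids in 1-90, while the
+ # detector predicts contiguous indices 0-79 (the order of coco80_names);
+ # coco_id80_to_id91 maps a predicted index back to the original COCO category id.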
+
+ ids = list(sorted(self.coco.imgs.keys()))
+
+ # remove images that have no objects or whose objects have near-zero area
+ valid_ids = _coco_remove_images_without_annotations(self.coco, ids)
+ self.ids = valid_ids
+
+ def parse_targets(self,
+ coco_targets: list,
+ w: int = None,
+ h: int = None,
+ ratio: tuple = None,
+ pad: tuple = None):
+ assert w > 0
+ assert h > 0
+
+ # keep only non-crowd objects (iscrowd == 0)
+ anno = [obj for obj in coco_targets if obj['iscrowd'] == 0]
+
+ boxes = [obj["bbox"] for obj in anno]
+
+ # guard against no boxes via resizing
+ boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4)
+ # [xmin, ymin, w, h] -> [xmin, ymin, xmax, ymax]
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2] = np.clip(boxes[:, 0::2], a_min=0, a_max=w)
+ boxes[:, 1::2] = np.clip(boxes[:, 1::2], a_min=0, a_max=h)
+
+ classes = [self.coco80_classes.index(self.coco91_id2classes[obj["category_id"]])
+ for obj in anno]
+ classes = np.array(classes, dtype=int)
+
+ # keep only valid boxes, i.e. x_max > x_min and y_max > y_min
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+ boxes = boxes[keep]
+ classes = classes[keep]
+
+ if ratio is not None:
+ # width, height ratios
+ boxes[:, 0::2] *= ratio[0]
+ boxes[:, 1::2] *= ratio[1]
+
+ if pad is not None:
+ # dw, dh padding
+ dw, dh = pad
+ boxes[:, 0::2] += dw
+ boxes[:, 1::2] += dh
+
+ target_annotations = []
+ for i in range(boxes.shape[0]):
+ target_annotation = {
+ "category_id": int(classes[i]),
+ "bbox": boxes[i].tolist()
+ }
+ target_annotations.append(target_annotation)
+
+ return target_annotations
+
+ def __getitem__(self, index):
+ """
+ Get an item from the dataset at the specified index.
+ The image is letterboxed to self.size, and the ground-truth boxes are scaled and padded
+ accordingly, so they are absolute pixel coordinates in the letterboxed image.
+
+ :return: (annotation, input_image, metadata) where annotation is (index, target_annotations),
+ each target_annotation being a dictionary with keys "category_id" and "bbox"
+ ([xmin, ymin, xmax, ymax] in pixels), and metadata a dictionary with keys
+ "filename", "origin_shape", "shape", "img_id" and "ratio_pad"
+ """
+ coco = self.coco
+ img_id = self.ids[index]
+ ann_ids = coco.getAnnIds(imgIds=img_id)
+ coco_target = coco.loadAnns(ann_ids)
+
+ image_path = coco.loadImgs(img_id)[0]['file_name']
+ img = cv2.imread(os.path.join(self.img_root, image_path))
+
+ origin_h, origin_w, c = img.shape
+ image, ratio, pad = letterbox(img, auto=False, new_shape=self.size)
+ target_annotations = self.parse_targets(coco_target, origin_w, origin_h, ratio, pad)
+
+ item_annotation = (index, target_annotations)
+ input_image = np.expand_dims(image.transpose(2, 0, 1), axis=0).astype(
+ np.float32
+ )
+ return (
+ item_annotation,
+ input_image,
+ {"filename": str(image_path),
+ "origin_shape": img.shape,
+ "shape": image.shape,
+ "img_id": img_id,
+ "ratio_pad": [ratio, pad]},
+ )
+
+ def __len__(self):
+ return len(self.ids)
+
+ @staticmethod
+ def collate_fn(x):
+ return x
+
+
+class EvalCOCOMetric:
+ def __init__(self,
+ coco: COCO = None,
+ iou_type: str = "bbox",
+ results_file_name: str = "predict_results.json",
+ classes_mapping: dict = None):
+ self.coco = copy.deepcopy(coco)
+ self.results = []
+ self.classes_mapping = classes_mapping
+ self.coco_evaluator = None
+ assert iou_type in ["bbox"]
+ self.iou_type = iou_type
+ self.results_file_name = results_file_name
+
+ def prepare_for_coco_detection(self, ann, output):
+ """Convert the predictions of one image into the format required by COCOeval (object detection)."""
+ # predictions of a single image; skip if there are none
+ if len(output[0]) == 0:
+ return
+
+ img_id = ann[2]["img_id"]
+ per_image_boxes = output[0]
+ per_image_boxes = scale_coords(img1_shape=ann[2]["shape"],
+ coords=per_image_boxes,
+ img0_shape=ann[2]["origin_shape"],
+ ratio_pad=ann[2]["ratio_pad"])
+ # coco_eval expects each box as [x_min, y_min, w, h],
+ # while the predicted boxes are [x_min, y_min, x_max, y_max], so convert the format
+ per_image_boxes[:, 2:] -= per_image_boxes[:, :2]
+ per_image_classes = output[1].tolist()
+ per_image_scores = output[2].tolist()
+
+ # iterate over every detected object
+ for object_score, object_class, object_box in zip(
+ per_image_scores, per_image_classes, per_image_boxes):
+ object_score = float(object_score)
+ class_idx = int(object_class)
+ if self.classes_mapping is not None:
+ class_idx = self.classes_mapping[class_idx]
+ # We recommend rounding coordinates to the nearest tenth of a pixel
+ # to reduce resulting JSON file size.
+ object_box = [round(b, 2) for b in object_box.tolist()]
+
+ res = {"image_id": img_id,
+ "category_id": class_idx,
+ "bbox": object_box,
+ "score": round(object_score, 3)}
+ self.results.append(res)
+
+ def update(self, targets, outputs):
+ if self.iou_type == "bbox":
+ self.prepare_for_coco_detection(targets, outputs)
+ else:
+ raise KeyError(f"unsupported iou_type: {self.iou_type}")
+
+ def evaluate(self):
+ # write predict results into json file
+ json_str = json.dumps(self.results, indent=4)
+ with open(self.results_file_name, 'w') as json_file:
+ json_file.write(json_str)
+
+ # accumulate predictions from all images
+ coco_true = self.coco
+ coco_pre = coco_true.loadRes(self.results_file_name)
+
+ self.coco_evaluator = COCOeval(cocoGt=coco_true, cocoDt=coco_pre, iouType=self.iou_type)
+
+ self.coco_evaluator.evaluate()
+ self.coco_evaluator.accumulate()
+ print(f"IoU metric: {self.iou_type}")
+ self.coco_evaluator.summarize()
+
+ coco_info = self.coco_evaluator.stats.tolist() # numpy to list
+ return coco_info
+
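+
+
+# Minimal usage sketch (illustrative only, not part of the original file). It assumes a
+# hypothetical `run_model` callable that returns the detections of one image as a tuple
+# (boxes[N, 4] as an xyxy tensor on the letterboxed input, labels[N], scores[N]);
+# MyDataLoader and EvalCOCOMetric are the classes defined above.
+#
+# def evaluate_on_coco(run_model, coco_root):
+#     dataset = MyDataLoader(coco_root, dataset="val", size=(640, 640))
+#     metric = EvalCOCOMetric(coco=dataset.coco, iou_type="bbox",
+#                             classes_mapping=dataset.coco_id80_to_id91)
+#     for i in range(len(dataset)):
+#         item = dataset[i]             # (annotation, input_image, metadata)
+#         outputs = run_model(item[1])  # hypothetical model call
+#         metric.update(item, outputs)
+#     return metric.evaluate()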
diff --git a/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/compare_onnx_and_trt.py b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/compare_onnx_and_trt.py
new file mode 100644
index 000000000..a9293236b
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/compare_onnx_and_trt.py
@@ -0,0 +1,90 @@
+import numpy as np
+import tensorrt as trt
+import onnxruntime
+import pycuda.driver as cuda
+import pycuda.autoinit
+
+
+def normalize(image: np.ndarray) -> np.ndarray:
+ """
+ Normalize the image with the ImageNet mean and standard deviation
+ """
+ image = image.astype(np.float32)
+ mean = (0.485, 0.456, 0.406)
+ std = (0.229, 0.224, 0.225)
+ image /= 255.0
+ image -= mean
+ image /= std
+ return image
+
+
+def onnx_inference(onnx_path: str, image: np.ndarray):
+ # load onnx model
+ ort_session = onnxruntime.InferenceSession(onnx_path)
+
+ # compute onnx Runtime output prediction
+ ort_inputs = {ort_session.get_inputs()[0].name: image}
+ res_onnx = ort_session.run(None, ort_inputs)[0]
+ return res_onnx
+
+
+def trt_inference(trt_path: str, image: np.ndarray):
+ # Load the serialized TensorRT engine
+ trt_logger = trt.Logger(trt.Logger.WARNING)
+ with open(trt_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
+ engine = runtime.deserialize_cuda_engine(f.read())
+
+ with engine.create_execution_context() as context:
+ # Set input shape based on image dimensions for inference
+ context.set_binding_shape(engine.get_binding_index("input"), (1, 3, image.shape[-2], image.shape[-1]))
+ # Allocate host and device buffers
+ bindings = []
+ for binding in engine:
+ binding_idx = engine.get_binding_index(binding)
+ size = trt.volume(context.get_binding_shape(binding_idx))
+ dtype = trt.nptype(engine.get_binding_dtype(binding))
+ if engine.binding_is_input(binding):
+ input_buffer = np.ascontiguousarray(image)
+ input_memory = cuda.mem_alloc(image.nbytes)
+ bindings.append(int(input_memory))
+ else:
+ output_buffer = cuda.pagelocked_empty(size, dtype)
+ output_memory = cuda.mem_alloc(output_buffer.nbytes)
+ bindings.append(int(output_memory))
+
+ stream = cuda.Stream()
+ # Transfer input data to the GPU.
+ cuda.memcpy_htod_async(input_memory, input_buffer, stream)
+ # Run inference
+ context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
+ # Transfer prediction output from the GPU.
+ cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
+ # Synchronize the stream
+ stream.synchronize()
+
+ res_trt = np.reshape(output_buffer, (1, -1))
+
+ return res_trt
+
+
+def main():
+ image_h = 224
+ image_w = 224
+ onnx_path = "resnet34.onnx"
+ trt_path = "trt_output/resnet34.trt"
+
+ image = np.random.randn(image_h, image_w, 3)
+ normalized_image = normalize(image)
+
+ # Convert the resized images to network input shape
+ # [h, w, c] -> [c, h, w] -> [1, c, h, w]
+ normalized_image = np.expand_dims(np.transpose(normalized_image, (2, 0, 1)), 0)
+
+ onnx_res = onnx_inference(onnx_path, normalized_image)
+ ir_res = trt_inference(trt_path, normalized_image)
+ np.testing.assert_allclose(onnx_res, ir_res, rtol=1e-03, atol=1e-05)
+ print("Exported model has been tested with TensorRT Runtime, and the result looks good!")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/convert_pytorch2onnx.py b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/convert_pytorch2onnx.py
new file mode 100644
index 000000000..7dec1c402
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/convert_pytorch2onnx.py
@@ -0,0 +1,58 @@
+import torch
+import torch.onnx
+import onnx
+import onnxruntime
+import numpy as np
+from torchvision.models import resnet34
+
+device = torch.device("cpu")
+
+
+def to_numpy(tensor):
+ return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+
+def main():
+ weights_path = "resNet34(flower).pth"
+ onnx_file_name = "resnet34.onnx"
+ batch_size = 1
+ img_h = 224
+ img_w = 224
+ img_channel = 3
+
+ # create model and load pretrain weights
+ model = resnet34(pretrained=False, num_classes=5)
+ model.load_state_dict(torch.load(weights_path, map_location='cpu'))
+
+ model.eval()
+ # input to the model
+ # [batch, channel, height, width]
+ x = torch.rand(batch_size, img_channel, img_h, img_w, requires_grad=True)
+ torch_out = model(x)
+
+ # export the model
+ torch.onnx.export(model, # model being run
+ x, # model input (or a tuple for multiple inputs)
+ onnx_file_name, # where to save the model (can be a file or file-like object)
+ input_names=["input"],
+ output_names=["output"],
+ verbose=False)
+
+ # check onnx model
+ onnx_model = onnx.load(onnx_file_name)
+ onnx.checker.check_model(onnx_model)
+
+ ort_session = onnxruntime.InferenceSession(onnx_file_name)
+
+ # compute ONNX Runtime output prediction
+ ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
+ ort_outs = ort_session.run(None, ort_inputs)
+
+ # compare ONNX Runtime and Pytorch results
+ # assert_allclose: Raises an AssertionError if two objects are not equal up to desired tolerance.
+ np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)
+ print("Exported model has been tested with ONNXRuntime, and the result looks good!")
+
+
+if __name__ == '__main__':
+ main()
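+
+
+# Optional variation (illustrative sketch, not part of the original script): torch.onnx.export
+# also accepts a dynamic_axes argument when the exported model should allow a variable batch
+# size at inference time, e.g.
+#
+# torch.onnx.export(model, x, onnx_file_name,
+#                   input_names=["input"], output_names=["output"],
+#                   dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}})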
diff --git a/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/my_dataset.py b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/my_dataset.py
new file mode 100644
index 000000000..167bc9a30
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/my_dataset.py
@@ -0,0 +1,37 @@
+from PIL import Image
+import torch
+from torch.utils.data import Dataset
+
+
+class MyDataSet(Dataset):
+ """Custom dataset"""
+
+ def __init__(self, images_path: list, images_class: list, transform=None):
+ self.images_path = images_path
+ self.images_class = images_class
+ self.transform = transform
+
+ def __len__(self):
+ return len(self.images_path)
+
+ def __getitem__(self, item):
+ img = Image.open(self.images_path[item])
+ # mode 'RGB' means a color image, 'L' means a grayscale image
+ if img.mode != 'RGB':
+ raise ValueError("image: {} isn't RGB mode.".format(self.images_path[item]))
+ label = self.images_class[item]
+
+ if self.transform is not None:
+ img = self.transform(img)
+
+ return img, label
+
+ @staticmethod
+ def collate_fn(batch):
+ # the official default_collate implementation can be found at
+ # https://github.com/pytorch/pytorch/blob/67b7e751e6b5931a9f45274653f4f653a4e6cdf6/torch/utils/data/_utils/collate.py
+ images, labels = tuple(zip(*batch))
+
+ images = torch.stack(images, dim=0)
+ labels = torch.as_tensor(labels)
+ return images, labels
diff --git a/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/quantization.py b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/quantization.py
new file mode 100644
index 000000000..6516ed744
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/quantization.py
@@ -0,0 +1,196 @@
+"""
+refer to:
+https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html
+"""
+import os
+import math
+import argparse
+
+from absl import logging
+from tqdm import tqdm
+import torch
+import torch.optim as optim
+import torch.optim.lr_scheduler as lr_scheduler
+from torchvision import transforms
+from torchvision.models.resnet import resnet34 as create_model
+from pytorch_quantization import nn as quant_nn
+from pytorch_quantization import quant_modules, calib
+from pytorch_quantization.tensor_quant import QuantDescriptor
+
+from my_dataset import MyDataSet
+from utils import read_split_data, train_one_epoch, evaluate
+
+logging.set_verbosity(logging.FATAL)
+
+
+def export_onnx(model, onnx_filename, onnx_bs):
+ model.eval()
+ # We have to shift to pytorch's fake quant ops before exporting the model to ONNX
+ quant_nn.TensorQuantizer.use_fb_fake_quant = True
+ opset_version = 13
+
+ print(f"Export ONNX file: {onnx_filename}")
+ dummy_input = torch.randn(onnx_bs, 3, 224, 224).cuda()
+ torch.onnx.export(model,
+ dummy_input,
+ onnx_filename,
+ verbose=False,
+ opset_version=opset_version,
+ enable_onnx_checker=False,
+ input_names=["input"],
+ output_names=["output"])
+
+
+def collect_stats(model, data_loader, num_batches):
+ """Feed data to the network and collect statistics"""
+
+ # Enable calibrators
+ for name, module in model.named_modules():
+ if isinstance(module, quant_nn.TensorQuantizer):
+ if module._calibrator is not None:
+ module.disable_quant()
+ module.enable_calib()
+ else:
+ module.disable()
+
+ for i, (images, _) in tqdm(enumerate(data_loader), total=num_batches):
+ model(images.cuda())
+ if i >= num_batches:
+ break
+
+ # Disable calibrators
+ for name, module in model.named_modules():
+ if isinstance(module, quant_nn.TensorQuantizer):
+ if module._calibrator is not None:
+ module.enable_quant()
+ module.disable_calib()
+ else:
+ module.enable()
+
+
+def compute_amax(model, **kwargs):
+ # Load calib result
+ for name, module in model.named_modules():
+ if isinstance(module, quant_nn.TensorQuantizer):
+ if module._calibrator is not None:
+ if isinstance(module._calibrator, calib.MaxCalibrator):
+ module.load_calib_amax()
+ else:
+ module.load_calib_amax(**kwargs)
+ print(f"{name:40}: {module}")
+ model.cuda()
+
+
+def main(args):
+ quant_modules.initialize()
+ assert torch.cuda.is_available(), "only support GPU!"
+
+ train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(args.data_path)
+
+ data_transform = {
+ "train": transforms.Compose([transforms.RandomResizedCrop(224),
+ transforms.RandomHorizontalFlip(),
+ transforms.ToTensor(),
+ transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])]),
+ "val": transforms.Compose([transforms.Resize(256),
+ transforms.CenterCrop(224),
+ transforms.ToTensor(),
+ transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])}
+
+ # instantiate the training dataset
+ train_dataset = MyDataSet(images_path=train_images_path,
+ images_class=train_images_label,
+ transform=data_transform["train"])
+
+ # instantiate the validation dataset
+ val_dataset = MyDataSet(images_path=val_images_path,
+ images_class=val_images_label,
+ transform=data_transform["val"])
+
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using {} dataloader workers per process'.format(nw))
+ train_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ val_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=batch_size,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ # ########################## #
+ # Post Training Quantization #
+ # ########################## #
+ # We will use histogram based calibration for activations and the default max calibration for weights.
+ quant_desc_input = QuantDescriptor(calib_method='histogram')
+ quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
+ quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)
+
+ model = create_model(num_classes=args.num_classes)
+ assert os.path.exists(args.weights), "weights file: '{}' does not exist.".format(args.weights)
+ model.load_state_dict(torch.load(args.weights, map_location='cpu'))
+ model.cuda()
+
+ # It is a bit slow since we collect histograms on CPU
+ with torch.no_grad():
+ collect_stats(model, val_loader, num_batches=1000 // batch_size)
+ compute_amax(model, method="percentile", percentile=99.99)
+ # validate
+ evaluate(model=model, data_loader=val_loader, epoch=0)
+
+ torch.save(model.state_dict(), "quant_model_calibrated.pth")
+
+ if args.qat:
+ # ########################### #
+ # Quantization Aware Training #
+ # ########################### #
+ pg = [p for p in model.parameters() if p.requires_grad]
+ optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=5E-5)
+ # Scheduler(half of a cosine period)
+ lf = lambda x: (math.cos(x * math.pi / 2 / args.epochs)) * (1 - args.lrf) + args.lrf
+ scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
+
+ for epoch in range(args.epochs):
+ # train
+ train_one_epoch(model=model, optimizer=optimizer, data_loader=train_loader, epoch=epoch)
+
+ scheduler.step()
+
+ # validate
+ evaluate(model=model, data_loader=val_loader, epoch=epoch)
+
+ export_onnx(model, args.onnx_filename, args.onnx_bs)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--num_classes', type=int, default=5)
+ parser.add_argument('--epochs', type=int, default=5)
+ parser.add_argument('--batch-size', type=int, default=8)
+ parser.add_argument('--lr', type=float, default=0.0001)
+ parser.add_argument('--lrf', type=float, default=0.01)
+
+ # root directory of the dataset
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
+ parser.add_argument('--data-path', type=str,
+ default="/data/flower_photos")
+
+ # path to the trained weights
+ parser.add_argument('--weights', type=str, default='./resNet(flower).pth',
+ help='trained weights path')
+
+ parser.add_argument('--device', default='cuda:0', help='device id (i.e. 0 or 0,1 or cpu)')
+
+ parser.add_argument('--onnx-filename', default='resnet34.onnx', help='save onnx model filename')
+ parser.add_argument('--onnx-bs', default=1, help='save onnx model batch size')
+ parser.add_argument('--qat', type=bool, default=True, help='whether use quantization aware training')
+
+ opt = parser.parse_args()
+
+ main(opt)
diff --git a/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/utils.py b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/utils.py
new file mode 100644
index 000000000..309c32675
--- /dev/null
+++ b/deploying_service/deploying_pytorch/convert_tensorrt/convert_resnet34/utils.py
@@ -0,0 +1,131 @@
+import os
+import sys
+import json
+import pickle
+import random
+
+import torch
+from tqdm import tqdm
+
+
+def read_split_data(root: str, val_rate: float = 0.2):
+ random.seed(0) # make the random split reproducible
+ assert os.path.exists(root), "dataset root: {} does not exist.".format(root)
+
+ # traverse the folders; each folder corresponds to one class
+ flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
+ # sort to keep the order consistent
+ flower_class.sort()
+ # generate class names and their corresponding numeric indices
+ class_indices = dict((k, v) for v, k in enumerate(flower_class))
+ json_str = json.dumps(dict((val, key) for key, val in class_indices.items()), indent=4)
+ with open('class_indices.json', 'w') as json_file:
+ json_file.write(json_str)
+
+ train_images_path = [] # paths of all training images
+ train_images_label = [] # class indices of the training images
+ val_images_path = [] # paths of all validation images
+ val_images_label = [] # class indices of the validation images
+ every_class_num = [] # number of samples per class
+ supported = [".jpg", ".JPG", ".png", ".PNG"] # supported file extensions
+ # iterate over the files in each class folder
+ for cla in flower_class:
+ cla_path = os.path.join(root, cla)
+ # collect the paths of all files with a supported extension
+ images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
+ if os.path.splitext(i)[-1] in supported]
+ # index corresponding to this class
+ image_class = class_indices[cla]
+ # record the number of samples of this class
+ every_class_num.append(len(images))
+ # randomly sample validation images according to val_rate
+ val_path = random.sample(images, k=int(len(images) * val_rate))
+
+ for img_path in images:
+ if img_path in val_path: # if the path was sampled for validation, add it to the validation set
+ val_images_path.append(img_path)
+ val_images_label.append(image_class)
+ else: # otherwise add it to the training set
+ train_images_path.append(img_path)
+ train_images_label.append(image_class)
+
+ print("{} images were found in the dataset.".format(sum(every_class_num)))
+ print("{} images for training.".format(len(train_images_path)))
+ print("{} images for validation.".format(len(val_images_path)))
+
+ return train_images_path, train_images_label, val_images_path, val_images_label
+
+
+def write_pickle(list_info: list, file_name: str):
+ with open(file_name, 'wb') as f:
+ pickle.dump(list_info, f)
+
+
+def read_pickle(file_name: str) -> list:
+ with open(file_name, 'rb') as f:
+ info_list = pickle.load(f)
+ return info_list
+
+
+def train_one_epoch(model, optimizer, data_loader, epoch):
+ model.train()
+ loss_function = torch.nn.CrossEntropyLoss()
+ accu_loss = torch.zeros(1).cuda() # accumulated loss
+ accu_num = torch.zeros(1).cuda() # accumulated number of correctly predicted samples
+ optimizer.zero_grad()
+
+ sample_num = 0
+ data_loader = tqdm(data_loader, file=sys.stdout)
+ for step, data in enumerate(data_loader):
+ images, labels = data
+ sample_num += images.shape[0]
+
+ pred = model(images.cuda())
+ pred_classes = torch.max(pred, dim=1)[1]
+ accu_num += torch.eq(pred_classes, labels.cuda()).sum()
+
+ loss = loss_function(pred, labels.cuda())
+ loss.backward()
+ accu_loss += loss.detach()
+
+ data_loader.desc = "[train epoch {}] loss: {:.3f}, acc: {:.3f}".format(epoch,
+ accu_loss.item() / (step + 1),
+ accu_num.item() / sample_num)
+
+ if not torch.isfinite(loss):
+ print('WARNING: non-finite loss, ending training ', loss)
+ sys.exit(1)
+
+ optimizer.step()
+ optimizer.zero_grad()
+
+ return accu_loss.item() / (step + 1), accu_num.item() / sample_num
+
+
+@torch.no_grad()
+def evaluate(model, data_loader, epoch):
+ loss_function = torch.nn.CrossEntropyLoss()
+
+ model.eval()
+
+ accu_num = torch.zeros(1).cuda() # accumulated number of correctly predicted samples
+ accu_loss = torch.zeros(1).cuda() # accumulated loss
+
+ sample_num = 0
+ data_loader = tqdm(data_loader, file=sys.stdout)
+ for step, data in enumerate(data_loader):
+ images, labels = data
+ sample_num += images.shape[0]
+
+ pred = model(images.cuda())
+ pred_classes = torch.max(pred, dim=1)[1]
+ accu_num += torch.eq(pred_classes, labels.cuda()).sum()
+
+ loss = loss_function(pred, labels.cuda())
+ accu_loss += loss
+
+ data_loader.desc = "[valid epoch {}] loss: {:.3f}, acc: {:.3f}".format(epoch,
+ accu_loss.item() / (step + 1),
+ accu_num.item() / sample_num)
+
+ return accu_loss.item() / (step + 1), accu_num.item() / sample_num
diff --git a/deploying_service/deploying_pytorch/pytorch_flask_service/main.py b/deploying_service/deploying_pytorch/pytorch_flask_service/main.py
index 974d2a453..2f25d6d0d 100644
--- a/deploying_service/deploying_pytorch/pytorch_flask_service/main.py
+++ b/deploying_service/deploying_pytorch/pytorch_flask_service/main.py
@@ -20,10 +20,10 @@
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# create model
-model = MobileNetV2(num_classes=5)
+model = MobileNetV2(num_classes=5).to(device)
# load model weights
model.load_state_dict(torch.load(weights_path, map_location=device))
-model.to(device)
+
model.eval()
# load class info
diff --git a/deploying_service/deploying_pytorch/pytorch_flask_service/requirements.txt b/deploying_service/deploying_pytorch/pytorch_flask_service/requirements.txt
index 83b476f73..bdbbb72cd 100644
--- a/deploying_service/deploying_pytorch/pytorch_flask_service/requirements.txt
+++ b/deploying_service/deploying_pytorch/pytorch_flask_service/requirements.txt
@@ -1,3 +1,3 @@
-Flask==1.1.1
+Flask==2.2.5
Flask_Cors==3.0.9
Pillow
diff --git a/pytorch_classification/ConvNeXt/README.md b/pytorch_classification/ConvNeXt/README.md
new file mode 100644
index 000000000..c93d9df0e
--- /dev/null
+++ b/pytorch_classification/ConvNeXt/README.md
@@ -0,0 +1,12 @@
+## How to use this code
+
+1. Download the dataset. By default the code uses the flower classification dataset, available at [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz);
+if that link is unavailable, it can also be downloaded from Baidu Netdisk: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg extraction code: 58p0
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder.
+3. Download the pre-trained weights. Each model in `model.py` lists the download address of its pre-trained weights; download the one that matches the model you use.
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights.
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training).
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the path of the trained weights (saved in the `weights` folder by default).
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict.
+8. With `model_weight_path` and `img_path` set, you can run predictions with `predict.py` (see the sketch after this list for the minimal code this corresponds to).
+9. To use your own dataset, arrange it with the same folder structure as the flower dataset (one folder per class) and set `num_classes` in the training and prediction scripts to the number of classes in your data.
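+
+A minimal prediction sketch (illustrative; it assumes training has already produced `./weights/best_model.pth` and `class_indices.json`, and simply mirrors what `predict.py` does):
+
+```python
+import json
+import torch
+from PIL import Image
+from torchvision import transforms
+from model import convnext_tiny as create_model
+
+data_transform = transforms.Compose(
+    [transforms.Resize(int(224 * 1.14)), transforms.CenterCrop(224), transforms.ToTensor(),
+     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
+img = data_transform(Image.open("tulip.jpg")).unsqueeze(0)  # the image path is just an example
+
+model = create_model(num_classes=5)
+model.load_state_dict(torch.load("./weights/best_model.pth", map_location="cpu"))
+model.eval()
+with torch.no_grad():
+    probs = torch.softmax(model(img).squeeze(0), dim=0)
+
+class_indict = json.load(open("class_indices.json"))
+print(class_indict[str(int(probs.argmax()))], float(probs.max()))
+```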
diff --git a/pytorch_classification/ConvNeXt/model.py b/pytorch_classification/ConvNeXt/model.py
new file mode 100644
index 000000000..6e8337c2b
--- /dev/null
+++ b/pytorch_classification/ConvNeXt/model.py
@@ -0,0 +1,212 @@
+"""
+original code from facebook research:
+https://github.com/facebookresearch/ConvNeXt
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def drop_path(x, drop_prob: float = 0., training: bool = False):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+ 'survival rate' as the argument.
+
+ """
+ if drop_prob == 0. or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+ random_tensor.floor_() # binarize
+ output = x.div(keep_prob) * random_tensor
+ return output
+
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+
+class LayerNorm(nn.Module):
+ r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
+ with shape (batch_size, channels, height, width).
+ """
+
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(normalized_shape), requires_grad=True)
+ self.bias = nn.Parameter(torch.zeros(normalized_shape), requires_grad=True)
+ self.eps = eps
+ self.data_format = data_format
+ if self.data_format not in ["channels_last", "channels_first"]:
+ raise ValueError(f"unsupported data format '{self.data_format}'")
+ self.normalized_shape = (normalized_shape,)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if self.data_format == "channels_last":
+ return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+ elif self.data_format == "channels_first":
+ # [batch_size, channels, height, width]
+ mean = x.mean(1, keepdim=True)
+ var = (x - mean).pow(2).mean(1, keepdim=True)
+ x = (x - mean) / torch.sqrt(var + self.eps)
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
+ return x
+
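+# Equivalence note (added comment): for x of shape [N, C, H, W],
+# LayerNorm(C, data_format="channels_first")(x) matches
+# LayerNorm(C, data_format="channels_last")(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+# up to floating-point error; the manual branch above simply avoids the permutes.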
+
+class Block(nn.Module):
+ r""" ConvNeXt Block. There are two equivalent implementations:
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+ We use (2) as we find it slightly faster in PyTorch
+
+ Args:
+ dim (int): Number of input channels.
+ drop_rate (float): Stochastic depth rate. Default: 0.0
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ """
+ def __init__(self, dim, drop_rate=0., layer_scale_init_value=1e-6):
+ super().__init__()
+ self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
+ self.norm = LayerNorm(dim, eps=1e-6, data_format="channels_last")
+ self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
+ self.act = nn.GELU()
+ self.pwconv2 = nn.Linear(4 * dim, dim)
+ self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim,)),
+ requires_grad=True) if layer_scale_init_value > 0 else None
+ self.drop_path = DropPath(drop_rate) if drop_rate > 0. else nn.Identity()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ shortcut = x
+ x = self.dwconv(x)
+ x = x.permute(0, 2, 3, 1) # [N, C, H, W] -> [N, H, W, C]
+ x = self.norm(x)
+ x = self.pwconv1(x)
+ x = self.act(x)
+ x = self.pwconv2(x)
+ if self.gamma is not None:
+ x = self.gamma * x
+ x = x.permute(0, 3, 1, 2) # [N, H, W, C] -> [N, C, H, W]
+
+ x = shortcut + self.drop_path(x)
+ return x
+
+
+class ConvNeXt(nn.Module):
+ r""" ConvNeXt
+ A PyTorch impl of : `A ConvNet for the 2020s` -
+ https://arxiv.org/pdf/2201.03545.pdf
+ Args:
+ in_chans (int): Number of input image channels. Default: 3
+ num_classes (int): Number of classes for classification head. Default: 1000
+ depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
+ dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
+ """
+ def __init__(self, in_chans: int = 3, num_classes: int = 1000, depths: list = None,
+ dims: list = None, drop_path_rate: float = 0., layer_scale_init_value: float = 1e-6,
+ head_init_scale: float = 1.):
+ super().__init__()
+ self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
+ stem = nn.Sequential(nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
+ LayerNorm(dims[0], eps=1e-6, data_format="channels_first"))
+ self.downsample_layers.append(stem)
+
+ # the 3 downsample layers placed before stage2-stage4
+ for i in range(3):
+ downsample_layer = nn.Sequential(LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
+ nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2))
+ self.downsample_layers.append(downsample_layer)
+
+ self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple blocks
+ dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+ cur = 0
+ # build the blocks stacked in each stage
+ for i in range(4):
+ stage = nn.Sequential(
+ *[Block(dim=dims[i], drop_rate=dp_rates[cur + j], layer_scale_init_value=layer_scale_init_value)
+ for j in range(depths[i])]
+ )
+ self.stages.append(stage)
+ cur += depths[i]
+
+ self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer
+ self.head = nn.Linear(dims[-1], num_classes)
+ self.apply(self._init_weights)
+ self.head.weight.data.mul_(head_init_scale)
+ self.head.bias.data.mul_(head_init_scale)
+
+ def _init_weights(self, m):
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
+ nn.init.trunc_normal_(m.weight, std=0.2)
+ nn.init.constant_(m.bias, 0)
+
+ def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+ for i in range(4):
+ x = self.downsample_layers[i](x)
+ x = self.stages[i](x)
+
+ return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.forward_features(x)
+ x = self.head(x)
+ return x
+
+
+def convnext_tiny(num_classes: int):
+ # https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth
+ model = ConvNeXt(depths=[3, 3, 9, 3],
+ dims=[96, 192, 384, 768],
+ num_classes=num_classes)
+ return model
+
+
+def convnext_small(num_classes: int):
+ # https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth
+ model = ConvNeXt(depths=[3, 3, 27, 3],
+ dims=[96, 192, 384, 768],
+ num_classes=num_classes)
+ return model
+
+
+def convnext_base(num_classes: int):
+ # https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth
+ # https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth
+ model = ConvNeXt(depths=[3, 3, 27, 3],
+ dims=[128, 256, 512, 1024],
+ num_classes=num_classes)
+ return model
+
+
+def convnext_large(num_classes: int):
+ # https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth
+ # https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth
+ model = ConvNeXt(depths=[3, 3, 27, 3],
+ dims=[192, 384, 768, 1536],
+ num_classes=num_classes)
+ return model
+
+
+def convnext_xlarge(num_classes: int):
+ # https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth
+ model = ConvNeXt(depths=[3, 3, 27, 3],
+ dims=[256, 512, 1024, 2048],
+ num_classes=num_classes)
+ return model
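+
+
+# Quick shape check (illustrative, not part of the original file):
+# m = convnext_tiny(num_classes=5)
+# y = m(torch.randn(1, 3, 224, 224))
+# assert y.shape == (1, 5)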
diff --git a/pytorch_classification/ConvNeXt/my_dataset.py b/pytorch_classification/ConvNeXt/my_dataset.py
new file mode 100644
index 000000000..167bc9a30
--- /dev/null
+++ b/pytorch_classification/ConvNeXt/my_dataset.py
@@ -0,0 +1,37 @@
+from PIL import Image
+import torch
+from torch.utils.data import Dataset
+
+
+class MyDataSet(Dataset):
+ """Custom dataset"""
+
+ def __init__(self, images_path: list, images_class: list, transform=None):
+ self.images_path = images_path
+ self.images_class = images_class
+ self.transform = transform
+
+ def __len__(self):
+ return len(self.images_path)
+
+ def __getitem__(self, item):
+ img = Image.open(self.images_path[item])
+ # mode 'RGB' means a color image, 'L' means a grayscale image
+ if img.mode != 'RGB':
+ raise ValueError("image: {} isn't RGB mode.".format(self.images_path[item]))
+ label = self.images_class[item]
+
+ if self.transform is not None:
+ img = self.transform(img)
+
+ return img, label
+
+ @staticmethod
+ def collate_fn(batch):
+ # the official default_collate implementation can be found at
+ # https://github.com/pytorch/pytorch/blob/67b7e751e6b5931a9f45274653f4f653a4e6cdf6/torch/utils/data/_utils/collate.py
+ images, labels = tuple(zip(*batch))
+
+ images = torch.stack(images, dim=0)
+ labels = torch.as_tensor(labels)
+ return images, labels
diff --git a/pytorch_classification/ConvNeXt/predict.py b/pytorch_classification/ConvNeXt/predict.py
new file mode 100644
index 000000000..a603b22e7
--- /dev/null
+++ b/pytorch_classification/ConvNeXt/predict.py
@@ -0,0 +1,63 @@
+import os
+import json
+
+import torch
+from PIL import Image
+from torchvision import transforms
+import matplotlib.pyplot as plt
+
+from model import convnext_tiny as create_model
+
+
+def main():
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ print(f"using {device} device.")
+
+ num_classes = 5
+ img_size = 224
+ data_transform = transforms.Compose(
+ [transforms.Resize(int(img_size * 1.14)),
+ transforms.CenterCrop(img_size),
+ transforms.ToTensor(),
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
+
+ # load image
+ img_path = "../tulip.jpg"
+ assert os.path.exists(img_path), "file: '{}' does not exist.".format(img_path)
+ img = Image.open(img_path)
+ plt.imshow(img)
+ # [N, C, H, W]
+ img = data_transform(img)
+ # expand batch dimension
+ img = torch.unsqueeze(img, dim=0)
+
+ # read class_indict
+ json_path = './class_indices.json'
+ assert os.path.exists(json_path), "file: '{}' does not exist.".format(json_path)
+
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
+
+ # create model
+ model = create_model(num_classes=num_classes).to(device)
+ # load model weights
+ model_weight_path = "./weights/best_model.pth"
+ model.load_state_dict(torch.load(model_weight_path, map_location=device))
+ model.eval()
+ with torch.no_grad():
+ # predict class
+ output = torch.squeeze(model(img.to(device))).cpu()
+ predict = torch.softmax(output, dim=0)
+ predict_cla = torch.argmax(predict).numpy()
+
+ print_res = "class: {} prob: {:.3}".format(class_indict[str(predict_cla)],
+ predict[predict_cla].numpy())
+ plt.title(print_res)
+ for i in range(len(predict)):
+ print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
+ predict[i].numpy()))
+ plt.show()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/pytorch_classification/ConvNeXt/train.py b/pytorch_classification/ConvNeXt/train.py
new file mode 100644
index 000000000..cbec65967
--- /dev/null
+++ b/pytorch_classification/ConvNeXt/train.py
@@ -0,0 +1,139 @@
+import os
+import argparse
+
+import torch
+import torch.optim as optim
+from torch.utils.tensorboard import SummaryWriter
+from torchvision import transforms
+
+from my_dataset import MyDataSet
+from model import convnext_tiny as create_model
+from utils import read_split_data, create_lr_scheduler, get_params_groups, train_one_epoch, evaluate
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+ print(f"using {device} device.")
+
+ if os.path.exists("./weights") is False:
+ os.makedirs("./weights")
+
+ tb_writer = SummaryWriter()
+
+ train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(args.data_path)
+
+ img_size = 224
+ data_transform = {
+ "train": transforms.Compose([transforms.RandomResizedCrop(img_size),
+ transforms.RandomHorizontalFlip(),
+ transforms.ToTensor(),
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]),
+ "val": transforms.Compose([transforms.Resize(int(img_size * 1.143)),
+ transforms.CenterCrop(img_size),
+ transforms.ToTensor(),
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])}
+
+ # instantiate the training dataset
+ train_dataset = MyDataSet(images_path=train_images_path,
+ images_class=train_images_label,
+ transform=data_transform["train"])
+
+ # instantiate the validation dataset
+ val_dataset = MyDataSet(images_path=val_images_path,
+ images_class=val_images_label,
+ transform=data_transform["val"])
+
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using {} dataloader workers per process'.format(nw))
+ train_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ val_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=batch_size,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ model = create_model(num_classes=args.num_classes).to(device)
+
+ if args.weights != "":
+ assert os.path.exists(args.weights), "weights file: '{}' does not exist.".format(args.weights)
+ weights_dict = torch.load(args.weights, map_location=device)["model"]
+ # remove the classification-head weights (the number of classes may differ)
+ for k in list(weights_dict.keys()):
+ if "head" in k:
+ del weights_dict[k]
+ print(model.load_state_dict(weights_dict, strict=False))
+
+ if args.freeze_layers:
+ for name, para in model.named_parameters():
+ # freeze all weights except the head
+ if "head" not in name:
+ para.requires_grad_(False)
+ else:
+ print("training {}".format(name))
+
+ # pg = [p for p in model.parameters() if p.requires_grad]
+ pg = get_params_groups(model, weight_decay=args.wd)
+ optimizer = optim.AdamW(pg, lr=args.lr, weight_decay=args.wd)
+ lr_scheduler = create_lr_scheduler(optimizer, len(train_loader), args.epochs,
+ warmup=True, warmup_epochs=1)
+
+ best_acc = 0.
+ for epoch in range(args.epochs):
+ # train
+ train_loss, train_acc = train_one_epoch(model=model,
+ optimizer=optimizer,
+ data_loader=train_loader,
+ device=device,
+ epoch=epoch,
+ lr_scheduler=lr_scheduler)
+
+ # validate
+ val_loss, val_acc = evaluate(model=model,
+ data_loader=val_loader,
+ device=device,
+ epoch=epoch)
+
+ tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
+ tb_writer.add_scalar(tags[0], train_loss, epoch)
+ tb_writer.add_scalar(tags[1], train_acc, epoch)
+ tb_writer.add_scalar(tags[2], val_loss, epoch)
+ tb_writer.add_scalar(tags[3], val_acc, epoch)
+ tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
+
+ if best_acc < val_acc:
+ torch.save(model.state_dict(), "./weights/best_model.pth")
+ best_acc = val_acc
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--num_classes', type=int, default=5)
+ parser.add_argument('--epochs', type=int, default=10)
+ parser.add_argument('--batch-size', type=int, default=8)
+ parser.add_argument('--lr', type=float, default=5e-4)
+ parser.add_argument('--wd', type=float, default=5e-2)
+
+ # root directory of the dataset
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
+ parser.add_argument('--data-path', type=str,
+ default="/data/flower_photos")
+
+ # path to the pre-trained weights; set it to an empty string to skip loading
+ # link: https://pan.baidu.com/s/1aNqQW4n_RrUlWUBNlaJRHA password: i83t
+ parser.add_argument('--weights', type=str, default='./convnext_tiny_1k_224_ema.pth',
+ help='initial weights path')
+ # whether to freeze all weights except the head
+ parser.add_argument('--freeze-layers', type=bool, default=False)
+ parser.add_argument('--device', default='cuda:0', help='device id (i.e. 0 or 0,1 or cpu)')
+
+ opt = parser.parse_args()
+
+ main(opt)
diff --git a/pytorch_classification/ConvNeXt/utils.py b/pytorch_classification/ConvNeXt/utils.py
new file mode 100644
index 000000000..c2bcb5594
--- /dev/null
+++ b/pytorch_classification/ConvNeXt/utils.py
@@ -0,0 +1,241 @@
+import os
+import sys
+import json
+import pickle
+import random
+import math
+
+import torch
+from tqdm import tqdm
+
+import matplotlib.pyplot as plt
+
+
+def read_split_data(root: str, val_rate: float = 0.2):
+ random.seed(0) # make the random split reproducible
+ assert os.path.exists(root), "dataset root: {} does not exist.".format(root)
+
+ # traverse the folders; each folder corresponds to one class
+ flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
+ # sort to keep the order consistent across platforms
+ flower_class.sort()
+ # generate class names and their corresponding numeric indices
+ class_indices = dict((k, v) for v, k in enumerate(flower_class))
+ json_str = json.dumps(dict((val, key) for key, val in class_indices.items()), indent=4)
+ with open('class_indices.json', 'w') as json_file:
+ json_file.write(json_str)
+
+ train_images_path = [] # paths of all training images
+ train_images_label = [] # class indices of the training images
+ val_images_path = [] # paths of all validation images
+ val_images_label = [] # class indices of the validation images
+ every_class_num = [] # number of samples per class
+ supported = [".jpg", ".JPG", ".png", ".PNG"] # supported file extensions
+ # iterate over the files in each class folder
+ for cla in flower_class:
+ cla_path = os.path.join(root, cla)
+ # collect the paths of all files with a supported extension
+ images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
+ if os.path.splitext(i)[-1] in supported]
+ # sort to keep the order consistent across platforms
+ images.sort()
+ # index corresponding to this class
+ image_class = class_indices[cla]
+ # record the number of samples of this class
+ every_class_num.append(len(images))
+ # randomly sample validation images according to val_rate
+ val_path = random.sample(images, k=int(len(images) * val_rate))
+
+ for img_path in images:
+ if img_path in val_path: # if the path was sampled for validation, add it to the validation set
+ val_images_path.append(img_path)
+ val_images_label.append(image_class)
+ else: # otherwise add it to the training set
+ train_images_path.append(img_path)
+ train_images_label.append(image_class)
+
+ print("{} images were found in the dataset.".format(sum(every_class_num)))
+ print("{} images for training.".format(len(train_images_path)))
+ print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
+
+ plot_image = False
+ if plot_image:
+ # bar chart of the number of samples per class
+ plt.bar(range(len(flower_class)), every_class_num, align='center')
+ # replace the x ticks 0,1,2,3,4 with the corresponding class names
+ plt.xticks(range(len(flower_class)), flower_class)
+ # add a value label on top of each bar
+ for i, v in enumerate(every_class_num):
+ plt.text(x=i, y=v + 5, s=str(v), ha='center')
+ # x-axis label
+ plt.xlabel('image class')
+ # y-axis label
+ plt.ylabel('number of images')
+ # chart title
+ plt.title('flower class distribution')
+ plt.show()
+
+ return train_images_path, train_images_label, val_images_path, val_images_label
+
+
+def plot_data_loader_image(data_loader):
+ batch_size = data_loader.batch_size
+ plot_num = min(batch_size, 4)
+
+ json_path = './class_indices.json'
+ assert os.path.exists(json_path), json_path + " does not exist."
+ json_file = open(json_path, 'r')
+ class_indices = json.load(json_file)
+
+ for data in data_loader:
+ images, labels = data
+ for i in range(plot_num):
+ # [C, H, W] -> [H, W, C]
+ img = images[i].numpy().transpose(1, 2, 0)
+ # undo the Normalize transform
+ img = (img * [0.229, 0.224, 0.225] + [0.485, 0.456, 0.406]) * 255
+ label = labels[i].item()
+ plt.subplot(1, plot_num, i+1)
+ plt.xlabel(class_indices[str(label)])
+ plt.xticks([]) # remove x-axis ticks
+ plt.yticks([]) # remove y-axis ticks
+ plt.imshow(img.astype('uint8'))
+ plt.show()
+
+
+def write_pickle(list_info: list, file_name: str):
+ with open(file_name, 'wb') as f:
+ pickle.dump(list_info, f)
+
+
+def read_pickle(file_name: str) -> list:
+ with open(file_name, 'rb') as f:
+ info_list = pickle.load(f)
+ return info_list
+
+
+def train_one_epoch(model, optimizer, data_loader, device, epoch, lr_scheduler):
+ model.train()
+ loss_function = torch.nn.CrossEntropyLoss()
+ accu_loss = torch.zeros(1).to(device) # accumulated loss
+ accu_num = torch.zeros(1).to(device) # accumulated number of correctly predicted samples
+ optimizer.zero_grad()
+
+ sample_num = 0
+ data_loader = tqdm(data_loader, file=sys.stdout)
+ for step, data in enumerate(data_loader):
+ images, labels = data
+ sample_num += images.shape[0]
+
+ pred = model(images.to(device))
+ pred_classes = torch.max(pred, dim=1)[1]
+ accu_num += torch.eq(pred_classes, labels.to(device)).sum()
+
+ loss = loss_function(pred, labels.to(device))
+ loss.backward()
+ accu_loss += loss.detach()
+
+ data_loader.desc = "[train epoch {}] loss: {:.3f}, acc: {:.3f}, lr: {:.5f}".format(
+ epoch,
+ accu_loss.item() / (step + 1),
+ accu_num.item() / sample_num,
+ optimizer.param_groups[0]["lr"]
+ )
+
+ if not torch.isfinite(loss):
+ print('WARNING: non-finite loss, ending training ', loss)
+ sys.exit(1)
+
+ optimizer.step()
+ optimizer.zero_grad()
+ # update lr
+ lr_scheduler.step()
+
+ return accu_loss.item() / (step + 1), accu_num.item() / sample_num
+
+
+@torch.no_grad()
+def evaluate(model, data_loader, device, epoch):
+ loss_function = torch.nn.CrossEntropyLoss()
+
+ model.eval()
+
+ accu_num = torch.zeros(1).to(device) # accumulated number of correctly predicted samples
+ accu_loss = torch.zeros(1).to(device) # accumulated loss
+
+ sample_num = 0
+ data_loader = tqdm(data_loader, file=sys.stdout)
+ for step, data in enumerate(data_loader):
+ images, labels = data
+ sample_num += images.shape[0]
+
+ pred = model(images.to(device))
+ pred_classes = torch.max(pred, dim=1)[1]
+ accu_num += torch.eq(pred_classes, labels.to(device)).sum()
+
+ loss = loss_function(pred, labels.to(device))
+ accu_loss += loss
+
+ data_loader.desc = "[valid epoch {}] loss: {:.3f}, acc: {:.3f}".format(
+ epoch,
+ accu_loss.item() / (step + 1),
+ accu_num.item() / sample_num
+ )
+
+ return accu_loss.item() / (step + 1), accu_num.item() / sample_num
+
+
+def create_lr_scheduler(optimizer,
+ num_step: int,
+ epochs: int,
+ warmup=True,
+ warmup_epochs=1,
+ warmup_factor=1e-3,
+ end_factor=1e-6):
+ assert num_step > 0 and epochs > 0
+ if warmup is False:
+ warmup_epochs = 0
+
+ def f(x):
+ """
+ Return a learning-rate multiplier as a function of the step count.
+ Note that PyTorch calls lr_scheduler.step() once before training starts.
+ """
+ if warmup is True and x <= (warmup_epochs * num_step):
+ alpha = float(x) / (warmup_epochs * num_step)
+ # during warmup the lr factor goes from warmup_factor -> 1
+ return warmup_factor * (1 - alpha) + alpha
+ else:
+ current_step = (x - warmup_epochs * num_step)
+ cosine_steps = (epochs - warmup_epochs) * num_step
+ # after warmup the lr factor goes from 1 -> end_factor
+ return ((1 + math.cos(current_step * math.pi / cosine_steps)) / 2) * (1 - end_factor) + end_factor
+
+ return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
+
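+# Worked example (illustrative, added comment): with num_step=100 steps per epoch, epochs=10
+# and warmup_epochs=1, f(x) ramps linearly from warmup_factor (1e-3) towards 1.0 over the
+# first 100 scheduler steps, then follows half a cosine from 1.0 down to end_factor (1e-6)
+# over the remaining 900 steps; the optimizer lr is base_lr * f(x).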
+
+def get_params_groups(model: torch.nn.Module, weight_decay: float = 1e-5):
+ # parameters the optimizer will train, grouped by weight decay
+ parameter_group_vars = {"decay": {"params": [], "weight_decay": weight_decay},
+ "no_decay": {"params": [], "weight_decay": 0.}}
+
+ # corresponding parameter names (for logging)
+ parameter_group_names = {"decay": {"params": [], "weight_decay": weight_decay},
+ "no_decay": {"params": [], "weight_decay": 0.}}
+
+ for name, param in model.named_parameters():
+ if not param.requires_grad:
+ continue # frozen weights
+
+ if len(param.shape) == 1 or name.endswith(".bias"):
+ group_name = "no_decay"
+ else:
+ group_name = "decay"
+
+ parameter_group_vars[group_name]["params"].append(param)
+ parameter_group_names[group_name]["params"].append(name)
+
+ print("Param groups = %s" % json.dumps(parameter_group_names, indent=2))
+ return list(parameter_group_vars.values())
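+
+
+# Usage sketch (illustrative, added comment): the returned list plugs directly into an
+# optimizer, e.g. optim.AdamW(get_params_groups(model, weight_decay=5e-2), lr=5e-4),
+# so biases and 1-D parameters (e.g. LayerNorm weights) are excluded from weight decay.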
diff --git a/pytorch_classification/MobileViT/README.md b/pytorch_classification/MobileViT/README.md
new file mode 100644
index 000000000..c93d9df0e
--- /dev/null
+++ b/pytorch_classification/MobileViT/README.md
@@ -0,0 +1,12 @@
+## How to use this code
+
+1. Download the dataset. By default the code uses the flower classification dataset, available at [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz);
+if that link is unavailable, it can also be downloaded from Baidu Netdisk: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg extraction code: 58p0
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder.
+3. Download the pre-trained weights. Each model in `model.py` lists the download address of its pre-trained weights; download the one that matches the model you use.
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights.
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training).
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the path of the trained weights (saved in the `weights` folder by default).
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict.
+8. With `model_weight_path` and `img_path` set, you can run predictions with `predict.py`.
+9. To use your own dataset, arrange it with the same folder structure as the flower dataset (one folder per class) and set `num_classes` in the training and prediction scripts to the number of classes in your data.
diff --git a/pytorch_classification/MobileViT/model.py b/pytorch_classification/MobileViT/model.py
new file mode 100644
index 000000000..1606f1b69
--- /dev/null
+++ b/pytorch_classification/MobileViT/model.py
@@ -0,0 +1,562 @@
+"""
+original code from apple:
+https://github.com/apple/ml-cvnets/blob/main/cvnets/models/classification/mobilevit.py
+"""
+
+from typing import Optional, Tuple, Union, Dict
+import math
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.nn import functional as F
+
+from transformer import TransformerEncoder
+from model_config import get_config
+
+
+def make_divisible(
+ v: Union[float, int],
+ divisor: Optional[int] = 8,
+ min_value: Optional[Union[float, int]] = None,
+) -> Union[float, int]:
+ """
+ This function is taken from the original tf repo.
+ It ensures that all layers have a channel number that is divisible by 8.
+ It can be seen here:
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+ :param v:
+ :param divisor:
+ :param min_value:
+ :return:
+ """
+ if min_value is None:
+ min_value = divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ # Make sure that round down does not go down by more than 10%.
+ if new_v < 0.9 * v:
+ new_v += divisor
+ return new_v
+
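+# Examples (illustrative, added comment): make_divisible(17) == 16 and make_divisible(91) == 88,
+# i.e. the value is rounded to the nearest multiple of 8, and the result is bumped up by one
+# divisor if rounding down would lose more than 10% of the original value.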
+
+class ConvLayer(nn.Module):
+ """
+ Applies a 2D convolution over an input
+
+ Args:
+ in_channels (int): :math:`C_{in}` from an expected input of size :math:`(N, C_{in}, H_{in}, W_{in})`
+ out_channels (int): :math:`C_{out}` from an expected output of size :math:`(N, C_{out}, H_{out}, W_{out})`
+ kernel_size (Union[int, Tuple[int, int]]): Kernel size for convolution.
+ stride (Union[int, Tuple[int, int]]): Stride for convolution. Default: 1
+ groups (Optional[int]): Number of groups in convolution. Default: 1
+ bias (Optional[bool]): Use bias. Default: ``False``
+ use_norm (Optional[bool]): Use normalization layer after convolution. Default: ``True``
+ use_act (Optional[bool]): Use activation layer after convolution (or convolution and normalization).
+ Default: ``True``
+
+ Shape:
+ - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
+ - Output: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+ .. note::
+ For depth-wise convolution, `groups=C_{in}=C_{out}`.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Optional[Union[int, Tuple[int, int]]] = 1,
+ groups: Optional[int] = 1,
+ bias: Optional[bool] = False,
+ use_norm: Optional[bool] = True,
+ use_act: Optional[bool] = True,
+ ) -> None:
+ super().__init__()
+
+ if isinstance(kernel_size, int):
+ kernel_size = (kernel_size, kernel_size)
+
+ if isinstance(stride, int):
+ stride = (stride, stride)
+
+ assert isinstance(kernel_size, Tuple)
+ assert isinstance(stride, Tuple)
+
+ padding = (
+ int((kernel_size[0] - 1) / 2),
+ int((kernel_size[1] - 1) / 2),
+ )
+
+ block = nn.Sequential()
+
+ conv_layer = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ groups=groups,
+ padding=padding,
+ bias=bias
+ )
+
+ block.add_module(name="conv", module=conv_layer)
+
+ if use_norm:
+ norm_layer = nn.BatchNorm2d(num_features=out_channels, momentum=0.1)
+ block.add_module(name="norm", module=norm_layer)
+
+ if use_act:
+ act_layer = nn.SiLU()
+ block.add_module(name="act", module=act_layer)
+
+ self.block = block
+
+ def forward(self, x: Tensor) -> Tensor:
+ return self.block(x)
+
+
+class InvertedResidual(nn.Module):
+ """
+ This class implements the inverted residual block, as described in the MobileNetV2 paper
+
+ Args:
+ in_channels (int): :math:`C_{in}` from an expected input of size :math:`(N, C_{in}, H_{in}, W_{in})`
+ out_channels (int): :math:`C_{out}` from an expected output of size :math:`(N, C_{out}, H_{out}, W_{out})`
+ stride (int): Use convolutions with a stride. Default: 1
+ expand_ratio (Union[int, float]): Expand the input channels by this factor in depth-wise conv
+ skip_connection (Optional[bool]): Use skip-connection. Default: True
+
+ Shape:
+ - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
+ - Output: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+ .. note::
+ If `in_channels != out_channels` and `stride > 1`, we set `skip_connection=False`
+
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ stride: int,
+ expand_ratio: Union[int, float],
+ skip_connection: Optional[bool] = True,
+ ) -> None:
+ assert stride in [1, 2]
+ hidden_dim = make_divisible(int(round(in_channels * expand_ratio)), 8)
+
+ super().__init__()
+
+ block = nn.Sequential()
+ if expand_ratio != 1:
+ block.add_module(
+ name="exp_1x1",
+ module=ConvLayer(
+ in_channels=in_channels,
+ out_channels=hidden_dim,
+ kernel_size=1
+ ),
+ )
+
+ block.add_module(
+ name="conv_3x3",
+ module=ConvLayer(
+ in_channels=hidden_dim,
+ out_channels=hidden_dim,
+ stride=stride,
+ kernel_size=3,
+ groups=hidden_dim
+ ),
+ )
+
+ block.add_module(
+ name="red_1x1",
+ module=ConvLayer(
+ in_channels=hidden_dim,
+ out_channels=out_channels,
+ kernel_size=1,
+ use_act=False,
+ use_norm=True,
+ ),
+ )
+
+ self.block = block
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.exp = expand_ratio
+ self.stride = stride
+ self.use_res_connect = (
+ self.stride == 1 and in_channels == out_channels and skip_connection
+ )
+
+ def forward(self, x: Tensor, *args, **kwargs) -> Tensor:
+ if self.use_res_connect:
+ return x + self.block(x)
+ else:
+ return self.block(x)
+
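+# Residual usage sketch:
+#   InvertedResidual(32, 32, stride=1, expand_ratio=4)  -> adds the skip connection
+#   InvertedResidual(32, 64, stride=2, expand_ratio=4)  -> plain feed-forward path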
+
+class MobileViTBlock(nn.Module):
+ """
+ This class defines the `MobileViT block <https://arxiv.org/abs/2110.02178>`_
+
+ Args:
+ in_channels (int): :math:`C_{in}` from an expected input of size :math:`(N, C_{in}, H, W)`
+ transformer_dim (int): Input dimension to the transformer unit
+ ffn_dim (int): Dimension of the FFN block
+ n_transformer_blocks (int): Number of transformer blocks. Default: 2
+ head_dim (int): Head dimension in the multi-head attention. Default: 32
+ attn_dropout (float): Dropout in multi-head attention. Default: 0.0
+ dropout (float): Dropout rate. Default: 0.0
+ ffn_dropout (float): Dropout between FFN layers in transformer. Default: 0.0
+ patch_h (int): Patch height for unfolding operation. Default: 8
+ patch_w (int): Patch width for unfolding operation. Default: 8
+ conv_ksize (int): Kernel size to learn local representations in MobileViT block. Default: 3
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ transformer_dim: int,
+ ffn_dim: int,
+ n_transformer_blocks: int = 2,
+ head_dim: int = 32,
+ attn_dropout: float = 0.0,
+ dropout: float = 0.0,
+ ffn_dropout: float = 0.0,
+ patch_h: int = 8,
+ patch_w: int = 8,
+ conv_ksize: Optional[int] = 3,
+ *args,
+ **kwargs
+ ) -> None:
+ super().__init__()
+
+ conv_3x3_in = ConvLayer(
+ in_channels=in_channels,
+ out_channels=in_channels,
+ kernel_size=conv_ksize,
+ stride=1
+ )
+ conv_1x1_in = ConvLayer(
+ in_channels=in_channels,
+ out_channels=transformer_dim,
+ kernel_size=1,
+ stride=1,
+ use_norm=False,
+ use_act=False
+ )
+
+ conv_1x1_out = ConvLayer(
+ in_channels=transformer_dim,
+ out_channels=in_channels,
+ kernel_size=1,
+ stride=1
+ )
+ conv_3x3_out = ConvLayer(
+ in_channels=2 * in_channels,
+ out_channels=in_channels,
+ kernel_size=conv_ksize,
+ stride=1
+ )
+
+ self.local_rep = nn.Sequential()
+ self.local_rep.add_module(name="conv_3x3", module=conv_3x3_in)
+ self.local_rep.add_module(name="conv_1x1", module=conv_1x1_in)
+
+ assert transformer_dim % head_dim == 0
+ num_heads = transformer_dim // head_dim
+
+ global_rep = [
+ TransformerEncoder(
+ embed_dim=transformer_dim,
+ ffn_latent_dim=ffn_dim,
+ num_heads=num_heads,
+ attn_dropout=attn_dropout,
+ dropout=dropout,
+ ffn_dropout=ffn_dropout
+ )
+ for _ in range(n_transformer_blocks)
+ ]
+ global_rep.append(nn.LayerNorm(transformer_dim))
+ self.global_rep = nn.Sequential(*global_rep)
+
+ self.conv_proj = conv_1x1_out
+ self.fusion = conv_3x3_out
+
+ self.patch_h = patch_h
+ self.patch_w = patch_w
+ self.patch_area = self.patch_w * self.patch_h
+
+ self.cnn_in_dim = in_channels
+ self.cnn_out_dim = transformer_dim
+ self.n_heads = num_heads
+ self.ffn_dim = ffn_dim
+ self.dropout = dropout
+ self.attn_dropout = attn_dropout
+ self.ffn_dropout = ffn_dropout
+ self.n_blocks = n_transformer_blocks
+ self.conv_ksize = conv_ksize
+
+ def unfolding(self, x: Tensor) -> Tuple[Tensor, Dict]:
+ patch_w, patch_h = self.patch_w, self.patch_h
+ patch_area = patch_w * patch_h
+ batch_size, in_channels, orig_h, orig_w = x.shape
+
+ new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h)
+ new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w)
+
+ interpolate = False
+ if new_w != orig_w or new_h != orig_h:
+ # Note: Padding can be done, but then it needs to be handled in attention function.
+ x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=False)
+ interpolate = True
+
+ # number of patches along width and height
+ num_patch_w = new_w // patch_w # n_w
+ num_patch_h = new_h // patch_h # n_h
+ num_patches = num_patch_h * num_patch_w # N
+
+ # [B, C, H, W] -> [B * C * n_h, p_h, n_w, p_w]
+ x = x.reshape(batch_size * in_channels * num_patch_h, patch_h, num_patch_w, patch_w)
+ # [B * C * n_h, p_h, n_w, p_w] -> [B * C * n_h, n_w, p_h, p_w]
+ x = x.transpose(1, 2)
+ # [B * C * n_h, n_w, p_h, p_w] -> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w
+ x = x.reshape(batch_size, in_channels, num_patches, patch_area)
+ # [B, C, N, P] -> [B, P, N, C]
+ x = x.transpose(1, 3)
+ # [B, P, N, C] -> [BP, N, C]
+ x = x.reshape(batch_size * patch_area, num_patches, -1)
+
+ info_dict = {
+ "orig_size": (orig_h, orig_w),
+ "batch_size": batch_size,
+ "interpolate": interpolate,
+ "total_patches": num_patches,
+ "num_patches_w": num_patch_w,
+ "num_patches_h": num_patch_h,
+ }
+
+ return x, info_dict
+
+ def folding(self, x: Tensor, info_dict: Dict) -> Tensor:
+ n_dim = x.dim()
+ assert n_dim == 3, "Tensor should be of shape BPxNxC. Got: {}".format(
+ x.shape
+ )
+ # [BP, N, C] --> [B, P, N, C]
+ x = x.contiguous().view(
+ info_dict["batch_size"], self.patch_area, info_dict["total_patches"], -1
+ )
+
+ batch_size, pixels, num_patches, channels = x.size()
+ num_patch_h = info_dict["num_patches_h"]
+ num_patch_w = info_dict["num_patches_w"]
+
+ # [B, P, N, C] -> [B, C, N, P]
+ x = x.transpose(1, 3)
+ # [B, C, N, P] -> [B*C*n_h, n_w, p_h, p_w]
+ x = x.reshape(batch_size * channels * num_patch_h, num_patch_w, self.patch_h, self.patch_w)
+ # [B*C*n_h, n_w, p_h, p_w] -> [B*C*n_h, p_h, n_w, p_w]
+ x = x.transpose(1, 2)
+ # [B*C*n_h, p_h, n_w, p_w] -> [B, C, H, W]
+ x = x.reshape(batch_size, channels, num_patch_h * self.patch_h, num_patch_w * self.patch_w)
+ if info_dict["interpolate"]:
+ x = F.interpolate(
+ x,
+ size=info_dict["orig_size"],
+ mode="bilinear",
+ align_corners=False,
+ )
+ return x
+
+ def forward(self, x: Tensor) -> Tensor:
+ res = x
+
+ fm = self.local_rep(x)
+
+ # convert feature map to patches
+ patches, info_dict = self.unfolding(fm)
+
+ # learn global representations
+ for transformer_layer in self.global_rep:
+ patches = transformer_layer(patches)
+
+ # convert patches back to the feature map: [BP, N, C] -> [B, C, H, W]
+ fm = self.folding(x=patches, info_dict=info_dict)
+
+ fm = self.conv_proj(fm)
+
+ fm = self.fusion(torch.cat((res, fm), dim=1))
+ return fm
+
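+# Shape walk-through for MobileViTBlock (illustrative values, patch_h = patch_w = 2,
+# d = transformer_dim):
+#   input  [B, C, 32, 32] --local_rep--> [B, d, 32, 32]
+#   unfolding: n_h = n_w = 16, N = 256 patches of P = 4 pixels  -> [B*4, 256, d]
+#   global_rep (transformer encoders) keep this shape           -> [B*4, 256, d]
+#   folding                                                     -> [B, d, 32, 32]
+#   conv_proj, then fusion with the residual input              -> [B, C, 32, 32]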
+
+class MobileViT(nn.Module):
+ """
+ This class implements the `MobileViT architecture <https://arxiv.org/abs/2110.02178>`_
+ """
+ def __init__(self, model_cfg: Dict, num_classes: int = 1000):
+ super().__init__()
+
+ image_channels = 3
+ out_channels = 16
+
+ self.conv_1 = ConvLayer(
+ in_channels=image_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=2
+ )
+
+ self.layer_1, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer1"])
+ self.layer_2, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer2"])
+ self.layer_3, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer3"])
+ self.layer_4, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer4"])
+ self.layer_5, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer5"])
+
+ exp_channels = min(model_cfg["last_layer_exp_factor"] * out_channels, 960)
+ self.conv_1x1_exp = ConvLayer(
+ in_channels=out_channels,
+ out_channels=exp_channels,
+ kernel_size=1
+ )
+
+ self.classifier = nn.Sequential()
+ self.classifier.add_module(name="global_pool", module=nn.AdaptiveAvgPool2d(1))
+ self.classifier.add_module(name="flatten", module=nn.Flatten())
+ if 0.0 < model_cfg["cls_dropout"] < 1.0:
+ self.classifier.add_module(name="dropout", module=nn.Dropout(p=model_cfg["cls_dropout"]))
+ self.classifier.add_module(name="fc", module=nn.Linear(in_features=exp_channels, out_features=num_classes))
+
+ # weight init
+ self.apply(self.init_parameters)
+
+ def _make_layer(self, input_channel, cfg: Dict) -> Tuple[nn.Sequential, int]:
+ block_type = cfg.get("block_type", "mobilevit")
+ if block_type.lower() == "mobilevit":
+ return self._make_mit_layer(input_channel=input_channel, cfg=cfg)
+ else:
+ return self._make_mobilenet_layer(input_channel=input_channel, cfg=cfg)
+
+ @staticmethod
+ def _make_mobilenet_layer(input_channel: int, cfg: Dict) -> Tuple[nn.Sequential, int]:
+ output_channels = cfg.get("out_channels")
+ num_blocks = cfg.get("num_blocks", 2)
+ expand_ratio = cfg.get("expand_ratio", 4)
+ block = []
+
+ for i in range(num_blocks):
+ stride = cfg.get("stride", 1) if i == 0 else 1
+
+ layer = InvertedResidual(
+ in_channels=input_channel,
+ out_channels=output_channels,
+ stride=stride,
+ expand_ratio=expand_ratio
+ )
+ block.append(layer)
+ input_channel = output_channels
+
+ return nn.Sequential(*block), input_channel
+
+ @staticmethod
+ def _make_mit_layer(input_channel: int, cfg: Dict) -> Tuple[nn.Sequential, int]:
+ stride = cfg.get("stride", 1)
+ block = []
+
+ if stride == 2:
+ layer = InvertedResidual(
+ in_channels=input_channel,
+ out_channels=cfg.get("out_channels"),
+ stride=stride,
+ expand_ratio=cfg.get("mv_expand_ratio", 4)
+ )
+
+ block.append(layer)
+ input_channel = cfg.get("out_channels")
+
+ transformer_dim = cfg["transformer_channels"]
+ ffn_dim = cfg.get("ffn_dim")
+ num_heads = cfg.get("num_heads", 4)
+ head_dim = transformer_dim // num_heads
+
+ if transformer_dim % head_dim != 0:
+ raise ValueError("Transformer input dimension should be divisible by head dimension. "
+ "Got {} and {}.".format(transformer_dim, head_dim))
+
+ block.append(MobileViTBlock(
+ in_channels=input_channel,
+ transformer_dim=transformer_dim,
+ ffn_dim=ffn_dim,
+ n_transformer_blocks=cfg.get("transformer_blocks", 1),
+ patch_h=cfg.get("patch_h", 2),
+ patch_w=cfg.get("patch_w", 2),
+ dropout=cfg.get("dropout", 0.1),
+ ffn_dropout=cfg.get("ffn_dropout", 0.0),
+ attn_dropout=cfg.get("attn_dropout", 0.1),
+ head_dim=head_dim,
+ conv_ksize=3
+ ))
+
+ return nn.Sequential(*block), input_channel
+
+ @staticmethod
+ def init_parameters(m):
+ if isinstance(m, nn.Conv2d):
+ if m.weight is not None:
+ nn.init.kaiming_normal_(m.weight, mode="fan_out")
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
+ if m.weight is not None:
+ nn.init.ones_(m.weight)
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, (nn.Linear,)):
+ if m.weight is not None:
+ nn.init.trunc_normal_(m.weight, mean=0.0, std=0.02)
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ else:
+ pass
+
+ def forward(self, x: Tensor) -> Tensor:
+ x = self.conv_1(x)
+ x = self.layer_1(x)
+ x = self.layer_2(x)
+
+ x = self.layer_3(x)
+ x = self.layer_4(x)
+ x = self.layer_5(x)
+ x = self.conv_1x1_exp(x)
+ x = self.classifier(x)
+ return x
+
+
+def mobile_vit_xx_small(num_classes: int = 1000):
+ # pretrain weight link
+ # https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_xxs.pt
+ config = get_config("xx_small")
+ m = MobileViT(config, num_classes=num_classes)
+ return m
+
+
+def mobile_vit_x_small(num_classes: int = 1000):
+ # pretrain weight link
+ # https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_xs.pt
+ config = get_config("x_small")
+ m = MobileViT(config, num_classes=num_classes)
+ return m
+
+
+def mobile_vit_small(num_classes: int = 1000):
+ # pretrain weight link
+ # https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_s.pt
+ config = get_config("small")
+ m = MobileViT(config, num_classes=num_classes)
+ return m
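+
+
+if __name__ == '__main__':
+    # Minimal smoke test (illustrative): for a 224x224 input the network downsamples
+    # by a factor of 32 and the classifier returns one logit per class.
+    net = mobile_vit_xx_small(num_classes=5)
+    out = net(torch.randn(1, 3, 224, 224))
+    print(out.shape)  # expected: torch.Size([1, 5])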
diff --git a/pytorch_classification/MobileViT/model_config.py b/pytorch_classification/MobileViT/model_config.py
new file mode 100644
index 000000000..932a0a0f0
--- /dev/null
+++ b/pytorch_classification/MobileViT/model_config.py
@@ -0,0 +1,176 @@
+def get_config(mode: str = "xx_small") -> dict:
+ if mode == "xx_small":
+ mv2_exp_mult = 2
+ config = {
+ "layer1": {
+ "out_channels": 16,
+ "expand_ratio": mv2_exp_mult,
+ "num_blocks": 1,
+ "stride": 1,
+ "block_type": "mv2",
+ },
+ "layer2": {
+ "out_channels": 24,
+ "expand_ratio": mv2_exp_mult,
+ "num_blocks": 3,
+ "stride": 2,
+ "block_type": "mv2",
+ },
+ "layer3": { # 28x28
+ "out_channels": 48,
+ "transformer_channels": 64,
+ "ffn_dim": 128,
+ "transformer_blocks": 2,
+ "patch_h": 2, # 8,
+ "patch_w": 2, # 8,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "layer4": { # 14x14
+ "out_channels": 64,
+ "transformer_channels": 80,
+ "ffn_dim": 160,
+ "transformer_blocks": 4,
+ "patch_h": 2, # 4,
+ "patch_w": 2, # 4,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "layer5": { # 7x7
+ "out_channels": 80,
+ "transformer_channels": 96,
+ "ffn_dim": 192,
+ "transformer_blocks": 3,
+ "patch_h": 2,
+ "patch_w": 2,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "last_layer_exp_factor": 4,
+ "cls_dropout": 0.1
+ }
+ elif mode == "x_small":
+ mv2_exp_mult = 4
+ config = {
+ "layer1": {
+ "out_channels": 32,
+ "expand_ratio": mv2_exp_mult,
+ "num_blocks": 1,
+ "stride": 1,
+ "block_type": "mv2",
+ },
+ "layer2": {
+ "out_channels": 48,
+ "expand_ratio": mv2_exp_mult,
+ "num_blocks": 3,
+ "stride": 2,
+ "block_type": "mv2",
+ },
+ "layer3": { # 28x28
+ "out_channels": 64,
+ "transformer_channels": 96,
+ "ffn_dim": 192,
+ "transformer_blocks": 2,
+ "patch_h": 2,
+ "patch_w": 2,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "layer4": { # 14x14
+ "out_channels": 80,
+ "transformer_channels": 120,
+ "ffn_dim": 240,
+ "transformer_blocks": 4,
+ "patch_h": 2,
+ "patch_w": 2,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "layer5": { # 7x7
+ "out_channels": 96,
+ "transformer_channels": 144,
+ "ffn_dim": 288,
+ "transformer_blocks": 3,
+ "patch_h": 2,
+ "patch_w": 2,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "last_layer_exp_factor": 4,
+ "cls_dropout": 0.1
+ }
+ elif mode == "small":
+ mv2_exp_mult = 4
+ config = {
+ "layer1": {
+ "out_channels": 32,
+ "expand_ratio": mv2_exp_mult,
+ "num_blocks": 1,
+ "stride": 1,
+ "block_type": "mv2",
+ },
+ "layer2": {
+ "out_channels": 64,
+ "expand_ratio": mv2_exp_mult,
+ "num_blocks": 3,
+ "stride": 2,
+ "block_type": "mv2",
+ },
+ "layer3": { # 28x28
+ "out_channels": 96,
+ "transformer_channels": 144,
+ "ffn_dim": 288,
+ "transformer_blocks": 2,
+ "patch_h": 2,
+ "patch_w": 2,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "layer4": { # 14x14
+ "out_channels": 128,
+ "transformer_channels": 192,
+ "ffn_dim": 384,
+ "transformer_blocks": 4,
+ "patch_h": 2,
+ "patch_w": 2,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "layer5": { # 7x7
+ "out_channels": 160,
+ "transformer_channels": 240,
+ "ffn_dim": 480,
+ "transformer_blocks": 3,
+ "patch_h": 2,
+ "patch_w": 2,
+ "stride": 2,
+ "mv_expand_ratio": mv2_exp_mult,
+ "num_heads": 4,
+ "block_type": "mobilevit",
+ },
+ "last_layer_exp_factor": 4,
+ "cls_dropout": 0.1
+ }
+ else:
+ raise NotImplementedError
+
+ for k in ["layer1", "layer2", "layer3", "layer4", "layer5"]:
+ config[k].update({"dropout": 0.1, "ffn_dropout": 0.0, "attn_dropout": 0.0})
+
+ return config
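+
+
+# Usage sketch (assumes model.py from this folder):
+#   from model import MobileViT
+#   net = MobileViT(get_config("x_small"), num_classes=5)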
diff --git a/pytorch_classification/MobileViT/my_dataset.py b/pytorch_classification/MobileViT/my_dataset.py
new file mode 100644
index 000000000..167bc9a30
--- /dev/null
+++ b/pytorch_classification/MobileViT/my_dataset.py
@@ -0,0 +1,37 @@
+from PIL import Image
+import torch
+from torch.utils.data import Dataset
+
+
+class MyDataSet(Dataset):
+ """自定义数据集"""
+
+ def __init__(self, images_path: list, images_class: list, transform=None):
+ self.images_path = images_path
+ self.images_class = images_class
+ self.transform = transform
+
+ def __len__(self):
+ return len(self.images_path)
+
+ def __getitem__(self, item):
+ img = Image.open(self.images_path[item])
+ # mode 'RGB' is a color image, 'L' a grayscale image
+ if img.mode != 'RGB':
+ raise ValueError("image: {} isn't RGB mode.".format(self.images_path[item]))
+ label = self.images_class[item]
+
+ if self.transform is not None:
+ img = self.transform(img)
+
+ return img, label
+
+ @staticmethod
+ def collate_fn(batch):
+ # the official default_collate implementation can be found at:
+ # https://github.com/pytorch/pytorch/blob/67b7e751e6b5931a9f45274653f4f653a4e6cdf6/torch/utils/data/_utils/collate.py
+ images, labels = tuple(zip(*batch))
+
+ images = torch.stack(images, dim=0)
+ labels = torch.as_tensor(labels)
+ return images, labels
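+
+
+# Usage sketch (images_path / images_class / data_transform are placeholders,
+# e.g. produced by read_split_data in utils.py):
+#   dataset = MyDataSet(images_path, images_class, transform=data_transform)
+#   loader = torch.utils.data.DataLoader(dataset, batch_size=8,
+#                                        collate_fn=MyDataSet.collate_fn)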
diff --git a/pytorch_classification/MobileViT/predict.py b/pytorch_classification/MobileViT/predict.py
new file mode 100644
index 000000000..525260912
--- /dev/null
+++ b/pytorch_classification/MobileViT/predict.py
@@ -0,0 +1,61 @@
+import os
+import json
+
+import torch
+from PIL import Image
+from torchvision import transforms
+import matplotlib.pyplot as plt
+
+from model import mobile_vit_xx_small as create_model
+
+
+def main():
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ img_size = 224
+ data_transform = transforms.Compose(
+ [transforms.Resize(int(img_size * 1.14)),
+ transforms.CenterCrop(img_size),
+ transforms.ToTensor(),
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
+
+ # load image
+ img_path = "../tulip.jpg"
+ assert os.path.exists(img_path), "file: '{}' does not exist.".format(img_path)
+ img = Image.open(img_path)
+ plt.imshow(img)
+ # [N, C, H, W]
+ img = data_transform(img)
+ # expand batch dimension
+ img = torch.unsqueeze(img, dim=0)
+
+ # read class_indict
+ json_path = './class_indices.json'
+ assert os.path.exists(json_path), "file: '{}' does not exist.".format(json_path)
+
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
+
+ # create model
+ model = create_model(num_classes=5).to(device)
+ # load model weights
+ model_weight_path = "./weights/best_model.pth"
+ model.load_state_dict(torch.load(model_weight_path, map_location=device))
+ model.eval()
+ with torch.no_grad():
+ # predict class
+ output = torch.squeeze(model(img.to(device))).cpu()
+ predict = torch.softmax(output, dim=0)
+ predict_cla = torch.argmax(predict).numpy()
+
+ print_res = "class: {} prob: {:.3}".format(class_indict[str(predict_cla)],
+ predict[predict_cla].numpy())
+ plt.title(print_res)
+ for i in range(len(predict)):
+ print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
+ predict[i].numpy()))
+ plt.show()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/pytorch_classification/MobileViT/train.py b/pytorch_classification/MobileViT/train.py
new file mode 100644
index 000000000..edb26ee98
--- /dev/null
+++ b/pytorch_classification/MobileViT/train.py
@@ -0,0 +1,135 @@
+import os
+import argparse
+
+import torch
+import torch.optim as optim
+from torch.utils.tensorboard import SummaryWriter
+from torchvision import transforms
+
+from my_dataset import MyDataSet
+from model import mobile_vit_xx_small as create_model
+from utils import read_split_data, train_one_epoch, evaluate
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+
+ if os.path.exists("./weights") is False:
+ os.makedirs("./weights")
+
+ tb_writer = SummaryWriter()
+
+ train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(args.data_path)
+
+ img_size = 224
+ data_transform = {
+ "train": transforms.Compose([transforms.RandomResizedCrop(img_size),
+ transforms.RandomHorizontalFlip(),
+ transforms.ToTensor(),
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]),
+ "val": transforms.Compose([transforms.Resize(int(img_size * 1.143)),
+ transforms.CenterCrop(img_size),
+ transforms.ToTensor(),
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])}
+
+ # instantiate the training dataset
+ train_dataset = MyDataSet(images_path=train_images_path,
+ images_class=train_images_label,
+ transform=data_transform["train"])
+
+ # instantiate the validation dataset
+ val_dataset = MyDataSet(images_path=val_images_path,
+ images_class=val_images_label,
+ transform=data_transform["val"])
+
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using {} dataloader workers per process'.format(nw))
+ train_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ val_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=batch_size,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ model = create_model(num_classes=args.num_classes).to(device)
+
+ if args.weights != "":
+ assert os.path.exists(args.weights), "weights file: '{}' does not exist.".format(args.weights)
+ weights_dict = torch.load(args.weights, map_location=device)
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ # remove the classifier weights (the number of classes may differ)
+ for k in list(weights_dict.keys()):
+ if "classifier" in k:
+ del weights_dict[k]
+ print(model.load_state_dict(weights_dict, strict=False))
+
+ if args.freeze_layers:
+ for name, para in model.named_parameters():
+ # freeze all weights except the classifier head
+ if "classifier" not in name:
+ para.requires_grad_(False)
+ else:
+ print("training {}".format(name))
+
+ pg = [p for p in model.parameters() if p.requires_grad]
+ optimizer = optim.AdamW(pg, lr=args.lr, weight_decay=1E-2)
+
+ best_acc = 0.
+ for epoch in range(args.epochs):
+ # train
+ train_loss, train_acc = train_one_epoch(model=model,
+ optimizer=optimizer,
+ data_loader=train_loader,
+ device=device,
+ epoch=epoch)
+
+ # validate
+ val_loss, val_acc = evaluate(model=model,
+ data_loader=val_loader,
+ device=device,
+ epoch=epoch)
+
+ tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
+ tb_writer.add_scalar(tags[0], train_loss, epoch)
+ tb_writer.add_scalar(tags[1], train_acc, epoch)
+ tb_writer.add_scalar(tags[2], val_loss, epoch)
+ tb_writer.add_scalar(tags[3], val_acc, epoch)
+ tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
+
+ if val_acc > best_acc:
+ best_acc = val_acc
+ torch.save(model.state_dict(), "./weights/best_model.pth")
+
+ torch.save(model.state_dict(), "./weights/latest_model.pth")
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--num_classes', type=int, default=5)
+ parser.add_argument('--epochs', type=int, default=10)
+ parser.add_argument('--batch-size', type=int, default=8)
+ parser.add_argument('--lr', type=float, default=0.0002)
+
+ # root directory of the dataset
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
+ parser.add_argument('--data-path', type=str,
+ default="/data/flower_photos")
+
+ # path to the pre-trained weights; set it to an empty string to train from scratch
+ parser.add_argument('--weights', type=str, default='./mobilevit_xxs.pt',
+ help='initial weights path')
+ # whether to freeze the backbone weights
+ parser.add_argument('--freeze-layers', type=bool, default=False)
+ parser.add_argument('--device', default='cuda:0', help='device id (i.e. 0 or 0,1 or cpu)')
+
+ opt = parser.parse_args()
+
+ main(opt)
diff --git a/pytorch_classification/MobileViT/transformer.py b/pytorch_classification/MobileViT/transformer.py
new file mode 100644
index 000000000..1124820df
--- /dev/null
+++ b/pytorch_classification/MobileViT/transformer.py
@@ -0,0 +1,155 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+class MultiHeadAttention(nn.Module):
+ """
+ This layer applies a multi-head self- or cross-attention as described in
+ `Attention is all you need <https://arxiv.org/abs/1706.03762>`_ paper
+
+ Args:
+ embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(N, P, C_{in})`
+ num_heads (int): Number of heads in multi-head attention
+ attn_dropout (float): Attention dropout. Default: 0.0
+ bias (bool): Use bias or not. Default: ``True``
+
+ Shape:
+ - Input: :math:`(N, P, C_{in})` where :math:`N` is batch size, :math:`P` is number of patches,
+ and :math:`C_{in}` is input embedding dim
+ - Output: same shape as the input
+
+ """
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ attn_dropout: float = 0.0,
+ bias: bool = True,
+ *args,
+ **kwargs
+ ) -> None:
+ super().__init__()
+ if embed_dim % num_heads != 0:
+ raise ValueError(
+ "Embedding dim must be divisible by number of heads in {}. Got: embed_dim={} and num_heads={}".format(
+ self.__class__.__name__, embed_dim, num_heads
+ )
+ )
+
+ self.qkv_proj = nn.Linear(in_features=embed_dim, out_features=3 * embed_dim, bias=bias)
+
+ self.attn_dropout = nn.Dropout(p=attn_dropout)
+ self.out_proj = nn.Linear(in_features=embed_dim, out_features=embed_dim, bias=bias)
+
+ self.head_dim = embed_dim // num_heads
+ self.scaling = self.head_dim ** -0.5
+ self.softmax = nn.Softmax(dim=-1)
+ self.num_heads = num_heads
+ self.embed_dim = embed_dim
+
+ def forward(self, x_q: Tensor) -> Tensor:
+ # [N, P, C]
+ b_sz, n_patches, in_channels = x_q.shape
+
+ # self-attention
+ # [N, P, C] -> [N, P, 3C] -> [N, P, 3, h, c] where C = hc
+ qkv = self.qkv_proj(x_q).reshape(b_sz, n_patches, 3, self.num_heads, -1)
+
+ # [N, P, 3, h, c] -> [N, h, 3, P, C]
+ qkv = qkv.transpose(1, 3).contiguous()
+
+ # [N, h, 3, P, C] -> [N, h, P, C] x 3
+ query, key, value = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
+
+ query = query * self.scaling
+
+ # [N, h, P, c] -> [N, h, c, P]
+ key = key.transpose(-1, -2)
+
+ # QK^T
+ # [N, h, P, c] x [N, h, c, P] -> [N, h, P, P]
+ attn = torch.matmul(query, key)
+ attn = self.softmax(attn)
+ attn = self.attn_dropout(attn)
+
+ # weighted sum
+ # [N, h, P, P] x [N, h, P, c] -> [N, h, P, c]
+ out = torch.matmul(attn, value)
+
+ # [N, h, P, c] -> [N, P, h, c] -> [N, P, C]
+ out = out.transpose(1, 2).reshape(b_sz, n_patches, -1)
+ out = self.out_proj(out)
+
+ return out
+
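+# Shape sketch: MultiHeadAttention(embed_dim=64, num_heads=4) splits the 64 channels
+# into 4 heads of head_dim=16 (scaling = 16 ** -0.5) and maps a (B, N, 64) input
+# back to (B, N, 64).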
+
+class TransformerEncoder(nn.Module):
+ """
+ This class defines the pre-norm `Transformer encoder <https://arxiv.org/abs/1706.03762>`_
+ Args:
+ embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(N, P, C_{in})`
+ ffn_latent_dim (int): Inner dimension of the FFN
+ num_heads (int) : Number of heads in multi-head attention. Default: 8
+ attn_dropout (float): Dropout rate for attention in multi-head attention. Default: 0.0
+ dropout (float): Dropout rate. Default: 0.0
+ ffn_dropout (float): Dropout between FFN layers. Default: 0.0
+
+ Shape:
+ - Input: :math:`(N, P, C_{in})` where :math:`N` is batch size, :math:`P` is number of patches,
+ and :math:`C_{in}` is input embedding dim
+ - Output: same shape as the input
+ """
+
+ def __init__(
+ self,
+ embed_dim: int,
+ ffn_latent_dim: int,
+ num_heads: Optional[int] = 8,
+ attn_dropout: Optional[float] = 0.0,
+ dropout: Optional[float] = 0.0,
+ ffn_dropout: Optional[float] = 0.0,
+ *args,
+ **kwargs
+ ) -> None:
+
+ super().__init__()
+
+ attn_unit = MultiHeadAttention(
+ embed_dim,
+ num_heads,
+ attn_dropout=attn_dropout,
+ bias=True
+ )
+
+ self.pre_norm_mha = nn.Sequential(
+ nn.LayerNorm(embed_dim),
+ attn_unit,
+ nn.Dropout(p=dropout)
+ )
+
+ self.pre_norm_ffn = nn.Sequential(
+ nn.LayerNorm(embed_dim),
+ nn.Linear(in_features=embed_dim, out_features=ffn_latent_dim, bias=True),
+ nn.SiLU(),
+ nn.Dropout(p=ffn_dropout),
+ nn.Linear(in_features=ffn_latent_dim, out_features=embed_dim, bias=True),
+ nn.Dropout(p=dropout)
+ )
+ self.embed_dim = embed_dim
+ self.ffn_dim = ffn_latent_dim
+ self.ffn_dropout = ffn_dropout
+ self.std_dropout = dropout
+
+ def forward(self, x: Tensor) -> Tensor:
+ # multi-head attention
+ res = x
+ x = self.pre_norm_mha(x)
+ x = x + res
+
+ # feed forward network
+ x = x + self.pre_norm_ffn(x)
+ return x
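+
+
+# Usage sketch: MobileViTBlock.global_rep in model.py stacks several of these blocks, e.g.
+#   enc = TransformerEncoder(embed_dim=96, ffn_latent_dim=192, num_heads=4)
+#   out = enc(torch.randn(8, 256, 96))   # -> shape (8, 256, 96)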
diff --git a/pytorch_classification/MobileViT/unfold_test.py b/pytorch_classification/MobileViT/unfold_test.py
new file mode 100644
index 000000000..6370a4b7d
--- /dev/null
+++ b/pytorch_classification/MobileViT/unfold_test.py
@@ -0,0 +1,56 @@
+import time
+import torch
+
+batch_size = 8
+in_channels = 32
+patch_h = 2
+patch_w = 2
+num_patch_h = 16
+num_patch_w = 16
+num_patches = num_patch_h * num_patch_w
+patch_area = patch_h * patch_w
+
+
+def official(x: torch.Tensor):
+ # [B, C, H, W] -> [B * C * n_h, p_h, n_w, p_w]
+ x = x.reshape(batch_size * in_channels * num_patch_h, patch_h, num_patch_w, patch_w)
+ # [B * C * n_h, p_h, n_w, p_w] -> [B * C * n_h, n_w, p_h, p_w]
+ x = x.transpose(1, 2)
+ # [B * C * n_h, n_w, p_h, p_w] -> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w
+ x = x.reshape(batch_size, in_channels, num_patches, patch_area)
+ # [B, C, N, P] -> [B, P, N, C]
+ x = x.transpose(1, 3)
+ # [B, P, N, C] -> [BP, N, C]
+ x = x.reshape(batch_size * patch_area, num_patches, -1)
+
+ return x
+
+
+def my_self(x: torch.Tensor):
+ # [B, C, H, W] -> [B, C, n_h, p_h, n_w, p_w]
+ x = x.reshape(batch_size, in_channels, num_patch_h, patch_h, num_patch_w, patch_w)
+ # [B, C, n_h, p_h, n_w, p_w] -> [B, C, n_h, n_w, p_h, p_w]
+ x = x.transpose(3, 4)
+ # [B, C, n_h, n_w, p_h, p_w] -> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w
+ x = x.reshape(batch_size, in_channels, num_patches, patch_area)
+ # [B, C, N, P] -> [B, P, N, C]
+ x = x.transpose(1, 3)
+ # [B, P, N, C] -> [BP, N, C]
+ x = x.reshape(batch_size * patch_area, num_patches, -1)
+
+ return x
+
+
+if __name__ == '__main__':
+ t = torch.randn(batch_size, in_channels, num_patch_h * patch_h, num_patch_w * patch_w)
+ print(torch.equal(official(t), my_self(t)))
+
+ t1 = time.time()
+ for _ in range(1000):
+ official(t)
+ print(f"official time: {time.time() - t1}")
+
+ t1 = time.time()
+ for _ in range(1000):
+ my_self(t)
+ print(f"self time: {time.time() - t1}")
diff --git a/pytorch_classification/MobileViT/utils.py b/pytorch_classification/MobileViT/utils.py
new file mode 100644
index 000000000..da201e6eb
--- /dev/null
+++ b/pytorch_classification/MobileViT/utils.py
@@ -0,0 +1,179 @@
+import os
+import sys
+import json
+import pickle
+import random
+
+import torch
+from tqdm import tqdm
+
+import matplotlib.pyplot as plt
+
+
+def read_split_data(root: str, val_rate: float = 0.2):
+ random.seed(0)  # make the random split reproducible
+ assert os.path.exists(root), "dataset root: {} does not exist.".format(root)
+
+ # traverse the folders; each folder corresponds to one class
+ flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
+ # sort to keep the order consistent across platforms
+ flower_class.sort()
+ # map each class name to a numeric index
+ class_indices = dict((k, v) for v, k in enumerate(flower_class))
+ json_str = json.dumps(dict((val, key) for key, val in class_indices.items()), indent=4)
+ with open('class_indices.json', 'w') as json_file:
+ json_file.write(json_str)
+
+ train_images_path = []  # paths of all training images
+ train_images_label = []  # class index of each training image
+ val_images_path = []  # paths of all validation images
+ val_images_label = []  # class index of each validation image
+ every_class_num = []  # number of samples per class
+ supported = [".jpg", ".JPG", ".png", ".PNG"]  # supported file extensions
+ # iterate over the files in each class folder
+ for cla in flower_class:
+ cla_path = os.path.join(root, cla)
+ # collect the paths of all files with a supported extension
+ images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
+ if os.path.splitext(i)[-1] in supported]
+ # sort to keep the order consistent across platforms
+ images.sort()
+ # index corresponding to this class
+ image_class = class_indices[cla]
+ # record the number of samples of this class
+ every_class_num.append(len(images))
+ # randomly sample validation images according to val_rate
+ val_path = random.sample(images, k=int(len(images) * val_rate))
+
+ for img_path in images:
+ if img_path in val_path:  # sampled paths go to the validation set
+ val_images_path.append(img_path)
+ val_images_label.append(image_class)
+ else:  # all remaining paths go to the training set
+ train_images_path.append(img_path)
+ train_images_label.append(image_class)
+
+ print("{} images were found in the dataset.".format(sum(every_class_num)))
+ print("{} images for training.".format(len(train_images_path)))
+ print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
+
+ plot_image = False
+ if plot_image:
+ # bar chart with the number of images per class
+ plt.bar(range(len(flower_class)), every_class_num, align='center')
+ # replace the x ticks 0,1,2,3,4 with the class names
+ plt.xticks(range(len(flower_class)), flower_class)
+ # add the count above each bar
+ for i, v in enumerate(every_class_num):
+ plt.text(x=i, y=v + 5, s=str(v), ha='center')
+ # x-axis label
+ plt.xlabel('image class')
+ # y-axis label
+ plt.ylabel('number of images')
+ # chart title
+ plt.title('flower class distribution')
+ plt.show()
+
+ return train_images_path, train_images_label, val_images_path, val_images_label
+
+
+def plot_data_loader_image(data_loader):
+ batch_size = data_loader.batch_size
+ plot_num = min(batch_size, 4)
+
+ json_path = './class_indices.json'
+ assert os.path.exists(json_path), json_path + " does not exist."
+ json_file = open(json_path, 'r')
+ class_indices = json.load(json_file)
+
+ for data in data_loader:
+ images, labels = data
+ for i in range(plot_num):
+ # [C, H, W] -> [H, W, C]
+ img = images[i].numpy().transpose(1, 2, 0)
+ # undo the Normalize transform
+ img = (img * [0.229, 0.224, 0.225] + [0.485, 0.456, 0.406]) * 255
+ label = labels[i].item()
+ plt.subplot(1, plot_num, i+1)
+ plt.xlabel(class_indices[str(label)])
+ plt.xticks([])  # remove x-axis ticks
+ plt.yticks([])  # remove y-axis ticks
+ plt.imshow(img.astype('uint8'))
+ plt.show()
+
+
+def write_pickle(list_info: list, file_name: str):
+ with open(file_name, 'wb') as f:
+ pickle.dump(list_info, f)
+
+
+def read_pickle(file_name: str) -> list:
+ with open(file_name, 'rb') as f:
+ info_list = pickle.load(f)
+ return info_list
+
+
+def train_one_epoch(model, optimizer, data_loader, device, epoch):
+ model.train()
+ loss_function = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
+ accu_loss = torch.zeros(1).to(device)  # accumulated loss
+ accu_num = torch.zeros(1).to(device)  # accumulated number of correct predictions
+ optimizer.zero_grad()
+
+ sample_num = 0
+ data_loader = tqdm(data_loader, file=sys.stdout)
+ for step, data in enumerate(data_loader):
+ images, labels = data
+ sample_num += images.shape[0]
+
+ pred = model(images.to(device))
+ pred_classes = torch.max(pred, dim=1)[1]
+ accu_num += torch.eq(pred_classes, labels.to(device)).sum()
+
+ loss = loss_function(pred, labels.to(device))
+ loss.backward()
+ accu_loss += loss.detach()
+
+ data_loader.desc = "[train epoch {}] loss: {:.3f}, acc: {:.3f}".format(epoch,
+ accu_loss.item() / (step + 1),
+ accu_num.item() / sample_num)
+
+ if not torch.isfinite(loss):
+ print('WARNING: non-finite loss, ending training ', loss)
+ sys.exit(1)
+
+ optimizer.step()
+ optimizer.zero_grad()
+
+ return accu_loss.item() / (step + 1), accu_num.item() / sample_num
+
+
+@torch.no_grad()
+def evaluate(model, data_loader, device, epoch):
+ loss_function = torch.nn.CrossEntropyLoss()
+
+ model.eval()
+
+ accu_num = torch.zeros(1).to(device)  # accumulated number of correct predictions
+ accu_loss = torch.zeros(1).to(device)  # accumulated loss
+
+ sample_num = 0
+ data_loader = tqdm(data_loader, file=sys.stdout)
+ for step, data in enumerate(data_loader):
+ images, labels = data
+ sample_num += images.shape[0]
+
+ pred = model(images.to(device))
+ pred_classes = torch.max(pred, dim=1)[1]
+ accu_num += torch.eq(pred_classes, labels.to(device)).sum()
+
+ loss = loss_function(pred, labels.to(device))
+ accu_loss += loss
+
+ data_loader.desc = "[valid epoch {}] loss: {:.3f}, acc: {:.3f}".format(epoch,
+ accu_loss.item() / (step + 1),
+ accu_num.item() / sample_num)
+
+ return accu_loss.item() / (step + 1), accu_num.item() / sample_num
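+
+
+# Typical epoch loop (sketch; train.py wires this up together with TensorBoard logging):
+#   for epoch in range(epochs):
+#       train_loss, train_acc = train_one_epoch(model, optimizer, train_loader, device, epoch)
+#       val_loss, val_acc = evaluate(model, val_loader, device, epoch)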
diff --git a/pytorch_classification/Test10_regnet/README.md b/pytorch_classification/Test10_regnet/README.md
new file mode 100644
index 000000000..4b41177f8
--- /dev/null
+++ b/pytorch_classification/Test10_regnet/README.md
@@ -0,0 +1,12 @@
+## How to use this code
+
+1. Download the dataset. The code uses the flower classification dataset by default, download link: [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz);
+if that link does not work, it is also available on Baidu Cloud: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg access code: 58p0
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder
+3. Download the pre-trained weights matching the model you use: https://pan.baidu.com/s/1XTo3walj9ai7ZhWz7jh-YA access code: 8lmu
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training)
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the path of your trained weights (saved in the `weights` folder by default)
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict
+8. With `model_weight_path` and `img_path` set, you can run `predict.py` to make predictions
+9. To use your own dataset, arrange it with the same folder structure as the flower dataset (one folder per class) and set `num_classes` in the training and prediction scripts to the number of classes in your data
diff --git a/pytorch_classification/Test10_regnet/predict.py b/pytorch_classification/Test10_regnet/predict.py
index d0f9b21b2..32df3cb2a 100644
--- a/pytorch_classification/Test10_regnet/predict.py
+++ b/pytorch_classification/Test10_regnet/predict.py
@@ -32,8 +32,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_regnet(model_name="RegNetY_400MF", num_classes=5).to(device)
diff --git a/pytorch_classification/Test10_regnet/train.py b/pytorch_classification/Test10_regnet/train.py
index 1a95cf567..19ce89940 100644
--- a/pytorch_classification/Test10_regnet/train.py
+++ b/pytorch_classification/Test10_regnet/train.py
@@ -123,7 +123,7 @@ def main(args):
parser.add_argument('--lrf', type=float, default=0.01)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str,
default="/data/flower_photos")
parser.add_argument('--model-name', default='RegNetY_400MF', help='create model name')
diff --git a/pytorch_classification/Test10_regnet/utils.py b/pytorch_classification/Test10_regnet/utils.py
index f4355900b..11f677974 100644
--- a/pytorch_classification/Test10_regnet/utils.py
+++ b/pytorch_classification/Test10_regnet/utils.py
@@ -16,7 +16,7 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历文件夹,一个文件夹对应一个类别
flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
- # 排序,保证顺序一致
+ # sort to keep the order consistent across platforms
flower_class.sort()
# 生成类别名称以及对应的数字索引
class_indices = dict((k, v) for v, k in enumerate(flower_class))
@@ -36,6 +36,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历获取supported支持的所有文件路径
images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
if os.path.splitext(i)[-1] in supported]
+ # sort to keep the order consistent across platforms
+ images.sort()
# 获取该类别对应的索引
image_class = class_indices[cla]
# 记录该类别的样本数量
@@ -54,6 +56,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
print("{} images were found in the dataset.".format(sum(every_class_num)))
print("{} images for training.".format(len(train_images_path)))
print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
plot_image = False
if plot_image:
diff --git a/pytorch_classification/Test11_efficientnetV2/README.md b/pytorch_classification/Test11_efficientnetV2/README.md
new file mode 100644
index 000000000..36fb99997
--- /dev/null
+++ b/pytorch_classification/Test11_efficientnetV2/README.md
@@ -0,0 +1,12 @@
+## How to use this code
+
+1. Download the dataset. The code uses the flower classification dataset by default, download link: [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz);
+if that link does not work, it is also available on Baidu Cloud: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg access code: 58p0
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder
+3. Download the pre-trained weights matching the model you use: https://pan.baidu.com/s/1uZX36rvrfEss-JGj4yfzbQ access code: 5gu1
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training)
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the path of your trained weights (saved in the `weights` folder by default)
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict
+8. With `model_weight_path` and `img_path` set, you can run `predict.py` to make predictions
+9. To use your own dataset, arrange it with the same folder structure as the flower dataset (one folder per class) and set `num_classes` in the training and prediction scripts to the number of classes in your data
diff --git a/pytorch_classification/Test11_efficientnetV2/predict.py b/pytorch_classification/Test11_efficientnetV2/predict.py
index d803571c6..690ddec6b 100644
--- a/pytorch_classification/Test11_efficientnetV2/predict.py
+++ b/pytorch_classification/Test11_efficientnetV2/predict.py
@@ -37,8 +37,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_model(num_classes=5).to(device)
diff --git a/pytorch_classification/Test11_efficientnetV2/train.py b/pytorch_classification/Test11_efficientnetV2/train.py
index 7aaab9b5e..cfe08bff1 100644
--- a/pytorch_classification/Test11_efficientnetV2/train.py
+++ b/pytorch_classification/Test11_efficientnetV2/train.py
@@ -127,7 +127,7 @@ def main(args):
parser.add_argument('--lrf', type=float, default=0.01)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str,
default="/data/flower_photos")
diff --git a/pytorch_classification/Test11_efficientnetV2/utils.py b/pytorch_classification/Test11_efficientnetV2/utils.py
index 96ad54a4b..23c53a06f 100644
--- a/pytorch_classification/Test11_efficientnetV2/utils.py
+++ b/pytorch_classification/Test11_efficientnetV2/utils.py
@@ -16,7 +16,7 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历文件夹,一个文件夹对应一个类别
flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
- # 排序,保证顺序一致
+ # sort to keep the order consistent across platforms
flower_class.sort()
# 生成类别名称以及对应的数字索引
class_indices = dict((k, v) for v, k in enumerate(flower_class))
@@ -36,6 +36,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历获取supported支持的所有文件路径
images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
if os.path.splitext(i)[-1] in supported]
+ # sort to keep the order consistent across platforms
+ images.sort()
# 获取该类别对应的索引
image_class = class_indices[cla]
# 记录该类别的样本数量
@@ -54,6 +56,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
print("{} images were found in the dataset.".format(sum(every_class_num)))
print("{} images for training.".format(len(train_images_path)))
print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
plot_image = False
if plot_image:
diff --git a/pytorch_classification/Test1_official_demo/predict.py b/pytorch_classification/Test1_official_demo/predict.py
index b1b597c00..c0ecf31f9 100644
--- a/pytorch_classification/Test1_official_demo/predict.py
+++ b/pytorch_classification/Test1_official_demo/predict.py
@@ -23,7 +23,7 @@ def main():
with torch.no_grad():
outputs = net(im)
- predict = torch.max(outputs, dim=1)[1].data.numpy()
+ predict = torch.max(outputs, dim=1)[1].numpy()
print(classes[int(predict)])
diff --git a/pytorch_classification/Test1_official_demo/train.py b/pytorch_classification/Test1_official_demo/train.py
index ae935ce03..fd61ddae2 100644
--- a/pytorch_classification/Test1_official_demo/train.py
+++ b/pytorch_classification/Test1_official_demo/train.py
@@ -25,7 +25,7 @@ def main():
val_loader = torch.utils.data.DataLoader(val_set, batch_size=5000,
shuffle=False, num_workers=0)
val_data_iter = iter(val_loader)
- val_image, val_label = val_data_iter.next()
+ val_image, val_label = next(val_data_iter)
# classes = ('plane', 'car', 'bird', 'cat',
# 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
diff --git a/pytorch_classification/Test2_alexnet/predict.py b/pytorch_classification/Test2_alexnet/predict.py
index 3b2fc1d7b..e96329867 100644
--- a/pytorch_classification/Test2_alexnet/predict.py
+++ b/pytorch_classification/Test2_alexnet/predict.py
@@ -32,8 +32,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = AlexNet(num_classes=5).to(device)
diff --git a/pytorch_classification/Test3_vggnet/predict.py b/pytorch_classification/Test3_vggnet/predict.py
index 248d4cbbc..a0375e9b7 100644
--- a/pytorch_classification/Test3_vggnet/predict.py
+++ b/pytorch_classification/Test3_vggnet/predict.py
@@ -31,8 +31,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = vgg(model_name="vgg16", num_classes=5).to(device)
diff --git a/pytorch_classification/Test4_googlenet/model.py b/pytorch_classification/Test4_googlenet/model.py
index 2282c56e9..954de7191 100644
--- a/pytorch_classification/Test4_googlenet/model.py
+++ b/pytorch_classification/Test4_googlenet/model.py
@@ -116,6 +116,8 @@ def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_pr
self.branch3 = nn.Sequential(
BasicConv2d(in_channels, ch5x5red, kernel_size=1),
+ # in the official implementation this branch actually uses a 3x3 kernel, not 5x5; it is left as 5x5 here, see the issue below for details
+ # Please see https://github.com/pytorch/vision/issues/906 for details.
BasicConv2d(ch5x5red, ch5x5, kernel_size=5, padding=2) # 保证输出大小等于输入大小
)
diff --git a/pytorch_classification/Test4_googlenet/predict.py b/pytorch_classification/Test4_googlenet/predict.py
index 11955e308..d91011fc8 100644
--- a/pytorch_classification/Test4_googlenet/predict.py
+++ b/pytorch_classification/Test4_googlenet/predict.py
@@ -31,8 +31,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = GoogLeNet(num_classes=5, aux_logits=False).to(device)
diff --git a/pytorch_classification/Test4_googlenet/train.py b/pytorch_classification/Test4_googlenet/train.py
index 0218478fd..32f8e0c10 100644
--- a/pytorch_classification/Test4_googlenet/train.py
+++ b/pytorch_classification/Test4_googlenet/train.py
@@ -60,8 +60,13 @@ def main():
# test_data_iter = iter(validate_loader)
# test_image, test_label = test_data_iter.next()
+ net = GoogLeNet(num_classes=5, aux_logits=True, init_weights=True)
+ # to use the official pre-trained weights, note that they must be loaded into the official model,
+ # not into our own implementation: the official model uses BN layers and different parameters, so the two cannot be mixed
+ # import torchvision
# net = torchvision.models.googlenet(num_classes=5)
# model_dict = net.state_dict()
+ # # pre-trained weights download url: https://download.pytorch.org/models/googlenet-1378be20.pth
# pretrain_model = torch.load("googlenet.pth")
# del_list = ["aux1.fc2.weight", "aux1.fc2.bias",
# "aux2.fc2.weight", "aux2.fc2.bias",
@@ -69,7 +74,6 @@ def main():
# pretrain_dict = {k: v for k, v in pretrain_model.items() if k not in del_list}
# model_dict.update(pretrain_dict)
# net.load_state_dict(model_dict)
- net = GoogLeNet(num_classes=5, aux_logits=True, init_weights=True)
net.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0003)
diff --git a/pytorch_classification/Test5_resnet/predict.py b/pytorch_classification/Test5_resnet/predict.py
index c327741c7..f478b3bfd 100644
--- a/pytorch_classification/Test5_resnet/predict.py
+++ b/pytorch_classification/Test5_resnet/predict.py
@@ -32,8 +32,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = resnet34(num_classes=5).to(device)
diff --git a/pytorch_classification/Test5_resnet/train.py b/pytorch_classification/Test5_resnet/train.py
index 2f8befdd9..310b462ce 100644
--- a/pytorch_classification/Test5_resnet/train.py
+++ b/pytorch_classification/Test5_resnet/train.py
@@ -1,4 +1,5 @@
import os
+import sys
import json
import torch
@@ -62,7 +63,7 @@ def main():
# download url: https://download.pytorch.org/models/resnet34-333f7ec4.pth
model_weight_path = "./resnet34-pre.pth"
assert os.path.exists(model_weight_path), "file {} does not exist.".format(model_weight_path)
- net.load_state_dict(torch.load(model_weight_path, map_location=device))
+ net.load_state_dict(torch.load(model_weight_path, map_location='cpu'))
# for param in net.parameters():
# param.requires_grad = False
diff --git a/pytorch_classification/Test6_mobilenet/predict.py b/pytorch_classification/Test6_mobilenet/predict.py
index a8a03ceb9..a0e6df088 100644
--- a/pytorch_classification/Test6_mobilenet/predict.py
+++ b/pytorch_classification/Test6_mobilenet/predict.py
@@ -32,8 +32,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = MobileNetV2(num_classes=5).to(device)
diff --git a/pytorch_classification/Test6_mobilenet/train.py b/pytorch_classification/Test6_mobilenet/train.py
index 594185467..0fe629212 100644
--- a/pytorch_classification/Test6_mobilenet/train.py
+++ b/pytorch_classification/Test6_mobilenet/train.py
@@ -67,7 +67,7 @@ def main():
# download url: https://download.pytorch.org/models/mobilenet_v2-b0353104.pth
model_weight_path = "./mobilenet_v2.pth"
assert os.path.exists(model_weight_path), "file {} dose not exist.".format(model_weight_path)
- pre_weights = torch.load(model_weight_path, map_location=device)
+ pre_weights = torch.load(model_weight_path, map_location='cpu')
# delete classifier weights
pre_dict = {k: v for k, v in pre_weights.items() if net.state_dict()[k].numel() == v.numel()}
diff --git a/pytorch_classification/Test7_shufflenet/README.md b/pytorch_classification/Test7_shufflenet/README.md
new file mode 100644
index 000000000..c93d9df0e
--- /dev/null
+++ b/pytorch_classification/Test7_shufflenet/README.md
@@ -0,0 +1,12 @@
+## How to use this code
+
+1. Download the dataset. The code uses the flower classification dataset by default, download link: [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz);
+if that link does not work, it is also available on Baidu Cloud: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg access code: 58p0
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder
+3. Download the pre-trained weights: `model.py` provides a download link for each model, pick the one matching the model you use
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training)
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the path of your trained weights (saved in the `weights` folder by default)
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict
+8. With `model_weight_path` and `img_path` set, you can run `predict.py` to make predictions
+9. To use your own dataset, arrange it with the same folder structure as the flower dataset (one folder per class) and set `num_classes` in the training and prediction scripts to the number of classes in your data
diff --git a/pytorch_classification/Test7_shufflenet/model.py b/pytorch_classification/Test7_shufflenet/model.py
index adc1dfa48..dbdb81967 100644
--- a/pytorch_classification/Test7_shufflenet/model.py
+++ b/pytorch_classification/Test7_shufflenet/model.py
@@ -147,6 +147,23 @@ def forward(self, x: Tensor) -> Tensor:
return self._forward_impl(x)
+def shufflenet_v2_x0_5(num_classes=1000):
+ """
+ Constructs a ShuffleNetV2 with 0.5x output channels, as described in
+ `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design"
+ <https://arxiv.org/abs/1807.11164>`_.
+ weight: https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth
+
+ :param num_classes:
+ :return:
+ """
+ model = ShuffleNetV2(stages_repeats=[4, 8, 4],
+ stages_out_channels=[24, 48, 96, 192, 1024],
+ num_classes=num_classes)
+
+ return model
+
+
def shufflenet_v2_x1_0(num_classes=1000):
"""
Constructs a ShuffleNetV2 with 1.0x output channels, as described in
@@ -164,18 +181,35 @@ def shufflenet_v2_x1_0(num_classes=1000):
return model
-def shufflenet_v2_x0_5(num_classes=1000):
+def shufflenet_v2_x1_5(num_classes=1000):
"""
- Constructs a ShuffleNetV2 with 0.5x output channels, as described in
+ Constructs a ShuffleNetV2 with 1.5x output channels, as described in
`"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design"
`.
- weight: https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth
+ weight: https://download.pytorch.org/models/shufflenetv2_x1_5-3c479a10.pth
:param num_classes:
:return:
"""
model = ShuffleNetV2(stages_repeats=[4, 8, 4],
- stages_out_channels=[24, 48, 96, 192, 1024],
+ stages_out_channels=[24, 176, 352, 704, 1024],
+ num_classes=num_classes)
+
+ return model
+
+
+def shufflenet_v2_x2_0(num_classes=1000):
+ """
+ Constructs a ShuffleNetV2 with 2.0x output channels, as described in
+ `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design"
+ <https://arxiv.org/abs/1807.11164>`_.
+ weight: https://download.pytorch.org/models/shufflenetv2_x2_0-8be3c8ee.pth
+
+ :param num_classes:
+ :return:
+ """
+ model = ShuffleNetV2(stages_repeats=[4, 8, 4],
+ stages_out_channels=[24, 244, 488, 976, 2048],
num_classes=num_classes)
return model
diff --git a/pytorch_classification/Test7_shufflenet/predict.py b/pytorch_classification/Test7_shufflenet/predict.py
index 2d62e6eac..8845b0a42 100644
--- a/pytorch_classification/Test7_shufflenet/predict.py
+++ b/pytorch_classification/Test7_shufflenet/predict.py
@@ -32,8 +32,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = shufflenet_v2_x1_0(num_classes=5).to(device)
diff --git a/pytorch_classification/Test7_shufflenet/train.py b/pytorch_classification/Test7_shufflenet/train.py
index 59e148eb3..1973a72fe 100644
--- a/pytorch_classification/Test7_shufflenet/train.py
+++ b/pytorch_classification/Test7_shufflenet/train.py
@@ -118,7 +118,7 @@ def main(args):
parser.add_argument('--lrf', type=float, default=0.1)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str,
default="/data/flower_photos")
diff --git a/pytorch_classification/Test7_shufflenet/utils.py b/pytorch_classification/Test7_shufflenet/utils.py
index f4355900b..11f677974 100644
--- a/pytorch_classification/Test7_shufflenet/utils.py
+++ b/pytorch_classification/Test7_shufflenet/utils.py
@@ -16,7 +16,7 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历文件夹,一个文件夹对应一个类别
flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
- # 排序,保证顺序一致
+ # sort to keep the order consistent across platforms
flower_class.sort()
# 生成类别名称以及对应的数字索引
class_indices = dict((k, v) for v, k in enumerate(flower_class))
@@ -36,6 +36,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历获取supported支持的所有文件路径
images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
if os.path.splitext(i)[-1] in supported]
+ # sort to keep the order consistent across platforms
+ images.sort()
# 获取该类别对应的索引
image_class = class_indices[cla]
# 记录该类别的样本数量
@@ -54,6 +56,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
print("{} images were found in the dataset.".format(sum(every_class_num)))
print("{} images for training.".format(len(train_images_path)))
print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
plot_image = False
if plot_image:
diff --git a/pytorch_classification/Test8_densenet/README.md b/pytorch_classification/Test8_densenet/README.md
new file mode 100644
index 000000000..c93d9df0e
--- /dev/null
+++ b/pytorch_classification/Test8_densenet/README.md
@@ -0,0 +1,12 @@
+## Usage
+
+1. Download the dataset. By default the code uses the flower classification dataset, download link: [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz),
+or, if that link does not work, via Baidu Netdisk: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg (extraction code: 58p0)
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder
+3. Download the pre-trained weights; `model.py` provides a download link for each model, so download the one matching the model you use
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training)
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the trained weights (saved in the `weights` folder by default)
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict
+8. With `model_weight_path` and `img_path` set, you can run prediction with `predict.py`
+9. To use your own dataset, arrange it like the flower dataset (one folder per class) and set `num_classes` in both the training and prediction scripts to the number of classes in your data (a quick layout check is sketched below)
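Step 9 above asks for one folder per class. The sketch below (standard library only; `data_root` is a hypothetical path) lists the detected classes and image counts so the layout can be verified before training:

```python
# Sketch: verify the "one folder per class" layout expected by the training script.
# data_root is a hypothetical path; point it at your own dataset.
import os

data_root = "/data/flower_photos"
classes = [d for d in sorted(os.listdir(data_root))
           if os.path.isdir(os.path.join(data_root, d))]
for cls in classes:
    n_imgs = len(os.listdir(os.path.join(data_root, cls)))
    print(f"{cls}: {n_imgs} images")
print(f"set num_classes to {len(classes)}")
```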
diff --git a/pytorch_classification/Test8_densenet/predict.py b/pytorch_classification/Test8_densenet/predict.py
index aa9d5d9ab..535358bee 100644
--- a/pytorch_classification/Test8_densenet/predict.py
+++ b/pytorch_classification/Test8_densenet/predict.py
@@ -32,8 +32,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = densenet121(num_classes=5).to(device)
diff --git a/pytorch_classification/Test8_densenet/train.py b/pytorch_classification/Test8_densenet/train.py
index 7f628c3d0..07b615dd0 100644
--- a/pytorch_classification/Test8_densenet/train.py
+++ b/pytorch_classification/Test8_densenet/train.py
@@ -115,7 +115,7 @@ def main(args):
parser.add_argument('--lrf', type=float, default=0.1)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str,
default="/data/flower_photos")
diff --git a/pytorch_classification/Test8_densenet/utils.py b/pytorch_classification/Test8_densenet/utils.py
index f4355900b..11f677974 100644
--- a/pytorch_classification/Test8_densenet/utils.py
+++ b/pytorch_classification/Test8_densenet/utils.py
@@ -16,7 +16,7 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历文件夹,一个文件夹对应一个类别
flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
- # 排序,保证顺序一致
+ # 排序,保证各平台顺序一致
flower_class.sort()
# 生成类别名称以及对应的数字索引
class_indices = dict((k, v) for v, k in enumerate(flower_class))
@@ -36,6 +36,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历获取supported支持的所有文件路径
images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
if os.path.splitext(i)[-1] in supported]
+ # 排序,保证各平台顺序一致
+ images.sort()
# 获取该类别对应的索引
image_class = class_indices[cla]
# 记录该类别的样本数量
@@ -54,6 +56,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
print("{} images were found in the dataset.".format(sum(every_class_num)))
print("{} images for training.".format(len(train_images_path)))
print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
plot_image = False
if plot_image:
diff --git a/pytorch_classification/Test9_efficientNet/README.md b/pytorch_classification/Test9_efficientNet/README.md
new file mode 100644
index 000000000..24fb5021d
--- /dev/null
+++ b/pytorch_classification/Test9_efficientNet/README.md
@@ -0,0 +1,12 @@
+## Usage
+
+1. Download the dataset. By default the code uses the flower classification dataset, download link: [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz),
+or, if that link does not work, via Baidu Netdisk: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg (extraction code: 58p0)
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder
+3. Download the pre-trained weights matching the model you use: https://pan.baidu.com/s/1ouX0UmjCsmSx3ZrqXbowjw (extraction code: 090i)
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training)
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the trained weights (saved in the `weights` folder by default)
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict
+8. With `model_weight_path` and `img_path` set, you can run prediction with `predict.py`
+9. To use your own dataset, arrange it like the flower dataset (one folder per class) and set `num_classes` in both the training and prediction scripts to the number of classes in your data
diff --git a/pytorch_classification/Test9_efficientNet/predict.py b/pytorch_classification/Test9_efficientNet/predict.py
index 56a278123..22f8e40c8 100644
--- a/pytorch_classification/Test9_efficientNet/predict.py
+++ b/pytorch_classification/Test9_efficientNet/predict.py
@@ -42,8 +42,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_model(num_classes=5).to(device)
diff --git a/pytorch_classification/Test9_efficientNet/train.py b/pytorch_classification/Test9_efficientNet/train.py
index 52f07a7fa..e20ec0692 100644
--- a/pytorch_classification/Test9_efficientNet/train.py
+++ b/pytorch_classification/Test9_efficientNet/train.py
@@ -129,7 +129,7 @@ def main(args):
parser.add_argument('--lrf', type=float, default=0.01)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str,
default="/data/flower_photos")
diff --git a/pytorch_classification/Test9_efficientNet/utils.py b/pytorch_classification/Test9_efficientNet/utils.py
index f4355900b..11f677974 100644
--- a/pytorch_classification/Test9_efficientNet/utils.py
+++ b/pytorch_classification/Test9_efficientNet/utils.py
@@ -16,7 +16,7 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历文件夹,一个文件夹对应一个类别
flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
- # 排序,保证顺序一致
+ # 排序,保证各平台顺序一致
flower_class.sort()
# 生成类别名称以及对应的数字索引
class_indices = dict((k, v) for v, k in enumerate(flower_class))
@@ -36,6 +36,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历获取supported支持的所有文件路径
images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
if os.path.splitext(i)[-1] in supported]
+ # 排序,保证各平台顺序一致
+ images.sort()
# 获取该类别对应的索引
image_class = class_indices[cla]
# 记录该类别的样本数量
@@ -54,6 +56,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
print("{} images were found in the dataset.".format(sum(every_class_num)))
print("{} images for training.".format(len(train_images_path)))
print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
plot_image = False
if plot_image:
diff --git a/pytorch_classification/custom_dataset/main.py b/pytorch_classification/custom_dataset/main.py
index 632756c74..3f987787c 100644
--- a/pytorch_classification/custom_dataset/main.py
+++ b/pytorch_classification/custom_dataset/main.py
@@ -6,7 +6,7 @@
from my_dataset import MyDataSet
from utils import read_split_data, plot_data_loader_image
-# http://download.tensorflow.org/example_images/flower_photos.tgz
+# https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
root = "/home/wz/my_github/data_set/flower_data/flower_photos" # 数据集所在根目录
diff --git a/pytorch_classification/grad_cam/README.md b/pytorch_classification/grad_cam/README.md
index f17087ebf..328600e1d 100644
--- a/pytorch_classification/grad_cam/README.md
+++ b/pytorch_classification/grad_cam/README.md
@@ -1 +1,12 @@
-Original Impl: https://github.com/jacobgil/pytorch-grad-cam
+## Grad-CAM
+- Original Impl: [https://github.com/jacobgil/pytorch-grad-cam](https://github.com/jacobgil/pytorch-grad-cam)
+- Grad-CAM introduction (video, in Chinese): [https://b23.tv/1kccjmb](https://b23.tv/1kccjmb)
+- Implementing Grad-CAM with PyTorch and drawing heatmaps (video, in Chinese): [https://b23.tv/n1e60vN](https://b23.tv/n1e60vN)
+
+## Workflow (swapping in your own network)
+1. Replace the model-creation code with your own and load your trained weights
+2. Choose `target_layers` appropriate for your network
+3. Use the preprocessing that matches your network
+4. Assign the path of the image to analyse to `img_path`
+5. Assign the class id of interest to `target_category` (a sketch of the full flow follows below)
+
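A sketch of these five steps wired together is shown below. The backbone, weight path and image path are placeholders, and the `cam(...)` / `show_cam_on_image(...)` call signatures are assumed to match the ones used by `main_cnn.py`; adjust them if your local `utils.py` differs.

```python
# Sketch only: Grad-CAM on a custom classifier. Placeholders are marked; the
# GradCAM / show_cam_on_image signatures are assumed to match main_cnn.py.
import numpy as np
import torch
from PIL import Image
from torchvision import models, transforms
from utils import GradCAM, show_cam_on_image, center_crop_img

model = models.mobilenet_v3_large(weights=None)            # step 1: your own model
# model.load_state_dict(torch.load("my_weights.pth", map_location="cpu"))  # your trained weights
target_layers = [model.features[-1]]                       # step 2: a suitable target layer

data_transform = transforms.Compose([                      # step 3: preprocessing used in training
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

img_path = "./my_image.jpg"                                # step 4: hypothetical image path
img = np.array(Image.open(img_path).convert('RGB'), dtype=np.uint8)
img = center_crop_img(img, 224)
input_tensor = torch.unsqueeze(data_transform(img), dim=0)

cam = GradCAM(model=model, target_layers=target_layers, use_cuda=False)
grayscale_cam = cam(input_tensor=input_tensor, target_category=281)  # step 5: class id of interest
visualization = show_cam_on_image(img.astype(np.float32) / 255., grayscale_cam[0, :], use_rgb=True)
```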
diff --git a/pytorch_classification/grad_cam/main_cnn.py b/pytorch_classification/grad_cam/main_cnn.py
index 6e2cb2476..254f8e767 100644
--- a/pytorch_classification/grad_cam/main_cnn.py
+++ b/pytorch_classification/grad_cam/main_cnn.py
@@ -5,7 +5,7 @@
import matplotlib.pyplot as plt
from torchvision import models
from torchvision import transforms
-from utils import GradCAM, show_cam_on_image
+from utils import GradCAM, show_cam_on_image, center_crop_img
def main():
@@ -31,10 +31,12 @@ def main():
assert os.path.exists(img_path), "file: '{}' dose not exist.".format(img_path)
img = Image.open(img_path).convert('RGB')
img = np.array(img, dtype=np.uint8)
+ # img = center_crop_img(img, 224)
- # [N, C, H, W]
+ # [C, H, W]
img_tensor = data_transform(img)
# expand batch dimension
+ # [C, H, W] -> [N, C, H, W]
input_tensor = torch.unsqueeze(img_tensor, dim=0)
cam = GradCAM(model=model, target_layers=target_layers, use_cuda=False)
diff --git a/pytorch_classification/grad_cam/main_swin.py b/pytorch_classification/grad_cam/main_swin.py
index 600cea00b..292d2a30c 100644
--- a/pytorch_classification/grad_cam/main_swin.py
+++ b/pytorch_classification/grad_cam/main_swin.py
@@ -1,4 +1,5 @@
import os
+import math
import numpy as np
import torch
from PIL import Image
@@ -9,15 +10,19 @@
class ResizeTransform:
- def __init__(self, height=7, width=7):
- self.height = height
- self.width = width
+ def __init__(self, im_h: int, im_w: int):
+ self.height = self.feature_size(im_h)
+ self.width = self.feature_size(im_w)
+
+ @staticmethod
+ def feature_size(s):
+ s = math.ceil(s / 4) # PatchEmbed
+ s = math.ceil(s / 2) # PatchMerging1
+ s = math.ceil(s / 2) # PatchMerging2
+ s = math.ceil(s / 2) # PatchMerging3
+ return s
def __call__(self, x):
- if isinstance(x, tuple):
- self.height = x[1]
- self.width = x[2]
- x = x[0]
result = x.reshape(x.size(0),
self.height,
self.width,
@@ -25,18 +30,24 @@ def __call__(self, x):
# Bring the channels to the first dimension,
# like in CNNs.
- result = result.transpose(2, 3).transpose(1, 2)
+ # [batch_size, H, W, C] -> [batch, C, H, W]
+ result = result.permute(0, 3, 1, 2)
return result
def main():
+ # Note: the input image size must be a multiple of 32,
+ # otherwise the internal padding makes the attention map drift
+ img_size = 224
+ assert img_size % 32 == 0
+
model = swin_base_patch4_window7_224()
# https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth
weights_path = "./swin_base_patch4_window7_224.pth"
model.load_state_dict(torch.load(weights_path, map_location="cpu")["model"], strict=False)
- target_layers = [model.layers[-2]]
+ target_layers = [model.norm]
data_transform = transforms.Compose([transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
@@ -45,14 +56,16 @@ def main():
assert os.path.exists(img_path), "file: '{}' dose not exist.".format(img_path)
img = Image.open(img_path).convert('RGB')
img = np.array(img, dtype=np.uint8)
- img = center_crop_img(img, 224)
+ img = center_crop_img(img, img_size)
- # [N, C, H, W]
+ # [C, H, W]
img_tensor = data_transform(img)
# expand batch dimension
+ # [C, H, W] -> [N, C, H, W]
input_tensor = torch.unsqueeze(img_tensor, dim=0)
- cam = GradCAM(model=model, target_layers=target_layers, use_cuda=False, reshape_transform=ResizeTransform())
+ cam = GradCAM(model=model, target_layers=target_layers, use_cuda=False,
+ reshape_transform=ResizeTransform(im_h=img_size, im_w=img_size))
target_category = 281 # tabby, tabby cat
# target_category = 254 # pug, pug-dog
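The `feature_size` helper above divides the input size by the patch-embedding stride (4) and then by the three patch-merging strides (2 each). The sketch below shows why the added `img_size % 32 == 0` assertion matters: sizes that are not a multiple of 32 get padded inside the model, so the token grid no longer matches the unpadded size that `ResizeTransform` assumes.

```python
# Sketch: token-grid size after PatchEmbed (stride 4) and three PatchMerging
# stages (stride 2 each), as computed by ResizeTransform.feature_size above.
import math

def feature_size(s: int) -> int:
    for stride in (4, 2, 2, 2):
        s = math.ceil(s / stride)
    return s

for size in (224, 256, 250):
    note = "multiple of 32" if size % 32 == 0 else "not a multiple of 32 -> padded inside the model"
    print(f"{size} -> {feature_size(size)} x {feature_size(size)} tokens ({note})")
```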
diff --git a/pytorch_classification/grad_cam/main_vit.py b/pytorch_classification/grad_cam/main_vit.py
index 8fc126b94..44a95c1fa 100644
--- a/pytorch_classification/grad_cam/main_vit.py
+++ b/pytorch_classification/grad_cam/main_vit.py
@@ -17,6 +17,7 @@ def __init__(self, model):
def __call__(self, x):
# remove cls token and reshape
+ # [batch_size, num_tokens, token_dim]
result = x[:, 1:, :].reshape(x.size(0),
self.h,
self.w,
@@ -24,7 +25,8 @@ def __call__(self, x):
# Bring the channels to the first dimension,
# like in CNNs.
- result = result.transpose(2, 3).transpose(1, 2)
+ # [batch_size, H, W, C] -> [batch, C, H, W]
+ result = result.permute(0, 3, 1, 2)
return result
@@ -47,9 +49,10 @@ def main():
img = Image.open(img_path).convert('RGB')
img = np.array(img, dtype=np.uint8)
img = center_crop_img(img, 224)
- # [N, C, H, W]
+ # [C, H, W]
img_tensor = data_transform(img)
# expand batch dimension
+ # [C, H, W] -> [N, C, H, W]
input_tensor = torch.unsqueeze(img_tensor, dim=0)
cam = GradCAM(model=model,
diff --git a/pytorch_classification/grad_cam/utils.py b/pytorch_classification/grad_cam/utils.py
index 005e6c477..acbb0f4da 100644
--- a/pytorch_classification/grad_cam/utils.py
+++ b/pytorch_classification/grad_cam/utils.py
@@ -4,7 +4,7 @@
class ActivationsAndGradients:
""" Class for extracting activations and
- registering gradients from targetted intermediate layers """
+ registering gradients from targeted intermediate layers """
def __init__(self, model, target_layers, reshape_transform):
self.model = model
@@ -16,7 +16,7 @@ def __init__(self, model, target_layers, reshape_transform):
self.handles.append(
target_layer.register_forward_hook(
self.save_activation))
- # Backward compatability with older pytorch versions:
+ # Backward compatibility with older pytorch versions:
if hasattr(target_layer, 'register_full_backward_hook'):
self.handles.append(
target_layer.register_full_backward_hook(
@@ -70,7 +70,7 @@ def __init__(self,
@staticmethod
def get_cam_weights(grads):
- return np.mean(grads, axis=(2, 3))
+ return np.mean(grads, axis=(2, 3), keepdims=True)
@staticmethod
def get_loss(output, target_category):
@@ -81,7 +81,7 @@ def get_loss(output, target_category):
def get_cam_image(self, activations, grads):
weights = self.get_cam_weights(grads)
- weighted_activations = weights[:, :, None, None] * activations
+ weighted_activations = weights * activations
cam = weighted_activations.sum(axis=1)
return cam
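The `keepdims=True` change keeps the pooled gradients at shape `[N, C, 1, 1]`, so `get_cam_image` can multiply them with the activations directly instead of re-inserting the two singleton axes. A quick NumPy sketch of the equivalence:

```python
# Sketch: with keepdims=True the channel weights broadcast directly against
# the activations, giving the same CAM as the old explicit [:, :, None, None].
import numpy as np

grads = np.random.rand(2, 512, 7, 7)        # [N, C, H, W] gradients at the target layer
activations = np.random.rand(2, 512, 7, 7)  # [N, C, H, W] activations at the target layer

weights = np.mean(grads, axis=(2, 3), keepdims=True)   # [N, C, 1, 1]
old_weights = np.mean(grads, axis=(2, 3))              # [N, C]

cam_new = (weights * activations).sum(axis=1)                         # [N, H, W]
cam_old = (old_weights[:, :, None, None] * activations).sum(axis=1)   # [N, H, W]
print(np.allclose(cam_new, cam_old))  # True
```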
diff --git a/pytorch_classification/swin_transformer/README.md b/pytorch_classification/swin_transformer/README.md
new file mode 100644
index 000000000..c93d9df0e
--- /dev/null
+++ b/pytorch_classification/swin_transformer/README.md
@@ -0,0 +1,12 @@
+## Usage
+
+1. Download the dataset. By default the code uses the flower classification dataset, download link: [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz),
+or, if that link does not work, via Baidu Netdisk: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg (extraction code: 58p0)
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder
+3. Download the pre-trained weights; `model.py` provides a download link for each model, so download the one matching the model you use
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training)
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the trained weights (saved in the `weights` folder by default)
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict
+8. With `model_weight_path` and `img_path` set, you can run prediction with `predict.py`
+9. To use your own dataset, arrange it like the flower dataset (one folder per class) and set `num_classes` in both the training and prediction scripts to the number of classes in your data
diff --git a/pytorch_classification/swin_transformer/predict.py b/pytorch_classification/swin_transformer/predict.py
index 999fde040..26e95c584 100644
--- a/pytorch_classification/swin_transformer/predict.py
+++ b/pytorch_classification/swin_transformer/predict.py
@@ -33,8 +33,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_model(num_classes=5).to(device)
diff --git a/pytorch_classification/swin_transformer/train.py b/pytorch_classification/swin_transformer/train.py
index 047545cc9..845d77575 100644
--- a/pytorch_classification/swin_transformer/train.py
+++ b/pytorch_classification/swin_transformer/train.py
@@ -113,7 +113,7 @@ def main(args):
parser.add_argument('--lr', type=float, default=0.0001)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str,
default="/data/flower_photos")
diff --git a/pytorch_classification/swin_transformer/utils.py b/pytorch_classification/swin_transformer/utils.py
index 96ad54a4b..23c53a06f 100644
--- a/pytorch_classification/swin_transformer/utils.py
+++ b/pytorch_classification/swin_transformer/utils.py
@@ -16,7 +16,7 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历文件夹,一个文件夹对应一个类别
flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
- # 排序,保证顺序一致
+ # 排序,保证各平台顺序一致
flower_class.sort()
# 生成类别名称以及对应的数字索引
class_indices = dict((k, v) for v, k in enumerate(flower_class))
@@ -36,6 +36,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历获取supported支持的所有文件路径
images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
if os.path.splitext(i)[-1] in supported]
+ # 排序,保证各平台顺序一致
+ images.sort()
# 获取该类别对应的索引
image_class = class_indices[cla]
# 记录该类别的样本数量
@@ -54,6 +56,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
print("{} images were found in the dataset.".format(sum(every_class_num)))
print("{} images for training.".format(len(train_images_path)))
print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
plot_image = False
if plot_image:
diff --git a/pytorch_classification/tensorboard_test/requirements.txt b/pytorch_classification/tensorboard_test/requirements.txt
index 15ba10ebe..c42b25958 100644
--- a/pytorch_classification/tensorboard_test/requirements.txt
+++ b/pytorch_classification/tensorboard_test/requirements.txt
@@ -1,6 +1,6 @@
torchvision==0.7.0
tqdm==4.42.1
matplotlib==3.2.1
-torch==1.6.0
+torch==1.13.1
Pillow
tensorboard
diff --git a/pytorch_classification/tensorboard_test/train.py b/pytorch_classification/tensorboard_test/train.py
index a2382e8da..25482b58b 100644
--- a/pytorch_classification/tensorboard_test/train.py
+++ b/pytorch_classification/tensorboard_test/train.py
@@ -150,7 +150,7 @@ def main(args):
parser.add_argument('--lrf', type=float, default=0.1)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
img_root = "/home/wz/my_project/my_github/data_set/flower_data/flower_photos"
parser.add_argument('--data-path', type=str, default=img_root)
diff --git a/pytorch_classification/train_multi_GPU/requirements.txt b/pytorch_classification/train_multi_GPU/requirements.txt
index f5fd2973e..a31687d0e 100644
--- a/pytorch_classification/train_multi_GPU/requirements.txt
+++ b/pytorch_classification/train_multi_GPU/requirements.txt
@@ -1,4 +1,4 @@
matplotlib==3.2.1
tqdm==4.42.1
torchvision==0.7.0
-torch==1.6.0
+torch==1.13.1
diff --git a/pytorch_classification/train_multi_GPU/train_multi_gpu_using_launch.py b/pytorch_classification/train_multi_GPU/train_multi_gpu_using_launch.py
index 6c5a84a31..944db144a 100644
--- a/pytorch_classification/train_multi_GPU/train_multi_gpu_using_launch.py
+++ b/pytorch_classification/train_multi_GPU/train_multi_gpu_using_launch.py
@@ -28,6 +28,7 @@ def main(args):
batch_size = args.batch_size
weights_path = args.weights
args.lr *= args.world_size # 学习率要根据并行GPU的数量进行倍增
+ checkpoint_path = ""
if rank == 0: # 在第一个进程中打印信息,并实例化tensorboard
print(args)
@@ -172,7 +173,7 @@ def main(args):
parser.add_argument('--syncBN', type=bool, default=True)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str, default="/home/wz/data_set/flower_data/flower_photos")
# resnet34 官方权重下载地址
diff --git a/pytorch_classification/train_multi_GPU/train_multi_gpu_using_spawn.py b/pytorch_classification/train_multi_GPU/train_multi_gpu_using_spawn.py
index 3aa3c7e28..1f8f9a564 100644
--- a/pytorch_classification/train_multi_GPU/train_multi_gpu_using_spawn.py
+++ b/pytorch_classification/train_multi_GPU/train_multi_gpu_using_spawn.py
@@ -46,6 +46,7 @@ def main_fun(rank, world_size, args):
batch_size = args.batch_size
weights_path = args.weights
args.lr *= args.world_size # 学习率要根据并行GPU的数量进行倍增
+ checkpoint_path = ""
if rank == 0: # 在第一个进程中打印信息,并实例化tensorboard
print(args)
@@ -191,7 +192,7 @@ def main_fun(rank, world_size, args):
parser.add_argument('--syncBN', type=bool, default=True)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str, default="/home/wz/data_set/flower_data/flower_photos")
# resnet34 官方权重下载地址
diff --git a/pytorch_classification/train_multi_GPU/train_single_gpu.py b/pytorch_classification/train_multi_GPU/train_single_gpu.py
index bc0fd7312..ce9df27ae 100644
--- a/pytorch_classification/train_multi_GPU/train_single_gpu.py
+++ b/pytorch_classification/train_multi_GPU/train_single_gpu.py
@@ -125,7 +125,7 @@ def main(args):
parser.add_argument('--lrf', type=float, default=0.1)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str,
default="/home/w180662/my_project/my_github/data_set/flower_data/flower_photos")
diff --git a/pytorch_classification/train_multi_GPU/utils.py b/pytorch_classification/train_multi_GPU/utils.py
index 5365d4ef2..54b2c7d18 100644
--- a/pytorch_classification/train_multi_GPU/utils.py
+++ b/pytorch_classification/train_multi_GPU/utils.py
@@ -12,7 +12,7 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历文件夹,一个文件夹对应一个类别
class_names = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
- # 排序,保证顺序一致
+ # 排序,保证各平台顺序一致
class_names.sort()
# 生成类别名称以及对应的数字索引
class_indices = dict((k, v) for v, k in enumerate(class_names))
@@ -32,6 +32,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历获取supported支持的所有文件路径
images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
if os.path.splitext(i)[-1] in supported]
+ # 排序,保证各平台顺序一致
+ images.sort()
# 获取该类别对应的索引
image_class = class_indices[cla]
# 记录该类别的样本数量
@@ -50,6 +52,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
print("{} images were found in the dataset.".format(sum(every_class_num)))
print("{} images for training.".format(len(train_images_path)))
print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
plot_image = False
if plot_image:
diff --git a/pytorch_classification/vision_transformer/README.md b/pytorch_classification/vision_transformer/README.md
new file mode 100644
index 000000000..4b700b2df
--- /dev/null
+++ b/pytorch_classification/vision_transformer/README.md
@@ -0,0 +1,12 @@
+## Usage
+
+1. Download the dataset. By default the code uses the flower classification dataset, download link: [https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz](https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz),
+or, if that link does not work, via Baidu Netdisk: https://pan.baidu.com/s/1QLCTA4sXnQAw_yvxPj9szg (extraction code: 58p0)
+2. In `train.py`, set `--data-path` to the absolute path of the extracted `flower_photos` folder
+3. Download the pre-trained weights; `vit_model.py` provides a download link for each model, so download the one matching the model you use
+4. In `train.py`, set the `--weights` argument to the path of the downloaded pre-trained weights
+5. With `--data-path` and `--weights` set, you can start training with `train.py` (a `class_indices.json` file is generated automatically during training)
+6. In `predict.py`, import the same model as in the training script and set `model_weight_path` to the trained weights (saved in the `weights` folder by default)
+7. In `predict.py`, set `img_path` to the absolute path of the image you want to predict
+8. With `model_weight_path` and `img_path` set, you can run prediction with `predict.py`
+9. To use your own dataset, arrange it like the flower dataset (one folder per class) and set `num_classes` in both the training and prediction scripts to the number of classes in your data
diff --git a/pytorch_classification/vision_transformer/predict.py b/pytorch_classification/vision_transformer/predict.py
index fad2d117a..1c4c7fe30 100644
--- a/pytorch_classification/vision_transformer/predict.py
+++ b/pytorch_classification/vision_transformer/predict.py
@@ -32,8 +32,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_model(num_classes=5, has_logits=False).to(device)
diff --git a/pytorch_classification/vision_transformer/train.py b/pytorch_classification/vision_transformer/train.py
index 6c1e7cda5..66bb1d296 100644
--- a/pytorch_classification/vision_transformer/train.py
+++ b/pytorch_classification/vision_transformer/train.py
@@ -61,7 +61,7 @@ def main(args):
num_workers=nw,
collate_fn=val_dataset.collate_fn)
- model = create_model(num_classes=5, has_logits=False).to(device)
+ model = create_model(num_classes=args.num_classes, has_logits=False).to(device)
if args.weights != "":
assert os.path.exists(args.weights), "weights file: '{}' not exist.".format(args.weights)
@@ -122,7 +122,7 @@ def main(args):
parser.add_argument('--lrf', type=float, default=0.01)
# 数据集所在根目录
- # http://download.tensorflow.org/example_images/flower_photos.tgz
+ # https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
parser.add_argument('--data-path', type=str,
default="/data/flower_photos")
parser.add_argument('--model-name', default='', help='create model name')
diff --git a/pytorch_classification/vision_transformer/utils.py b/pytorch_classification/vision_transformer/utils.py
index 96ad54a4b..23c53a06f 100644
--- a/pytorch_classification/vision_transformer/utils.py
+++ b/pytorch_classification/vision_transformer/utils.py
@@ -16,7 +16,7 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历文件夹,一个文件夹对应一个类别
flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
- # 排序,保证顺序一致
+ # 排序,保证各平台顺序一致
flower_class.sort()
# 生成类别名称以及对应的数字索引
class_indices = dict((k, v) for v, k in enumerate(flower_class))
@@ -36,6 +36,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
# 遍历获取supported支持的所有文件路径
images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
if os.path.splitext(i)[-1] in supported]
+ # 排序,保证各平台顺序一致
+ images.sort()
# 获取该类别对应的索引
image_class = class_indices[cla]
# 记录该类别的样本数量
@@ -54,6 +56,8 @@ def read_split_data(root: str, val_rate: float = 0.2):
print("{} images were found in the dataset.".format(sum(every_class_num)))
print("{} images for training.".format(len(train_images_path)))
print("{} images for validation.".format(len(val_images_path)))
+ assert len(train_images_path) > 0, "number of training images must be greater than 0."
+ assert len(val_images_path) > 0, "number of validation images must be greater than 0."
plot_image = False
if plot_image:
diff --git a/pytorch_keypoint/DeepPose/README.md b/pytorch_keypoint/DeepPose/README.md
new file mode 100644
index 000000000..9d0a54d79
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/README.md
@@ -0,0 +1,68 @@
+# DeepPose
+## Paper
+Title: `DeepPose: Human Pose Estimation via Deep Neural Networks`
+arXiv link: [https://arxiv.org/abs/1312.4659](https://arxiv.org/abs/1312.4659)
+
+## Environment
+The main development environment is listed below; see `requirements.txt` for the remaining Python dependencies
+- Python 3.10
+- torch 2.0.1+cu118 (this version or newer is recommended)
+- torchvision 0.15.2+cu118 (this version or newer is recommended)
+
+## Training dataset
+The project is trained on the WFLW dataset (98-point facial landmark detection), official page: [https://wywu.github.io/projects/LAB/WFLW.html](https://wywu.github.io/projects/LAB/WFLW.html)
+
+Download the dataset from the official page, extract it and arrange it as follows:
+```
+WFLW
+  ├── WFLW_annotations
+  │    ├── list_98pt_rect_attr_train_test
+  │    └── list_98pt_test
+  └── WFLW_images
+       ├── 0--Parade
+       ├── 1--Handshaking
+       ├── 10--People_Marching
+       ├── 11--Meeting
+       ├── 12--Group
+       └── ......
+```
+
+## Pre-trained weights
+The default backbone is torchvision's resnet50, so the ImageNet pre-trained weights are downloaded automatically when the model is instantiated.
+- If the training machine has network access, the weights are downloaded automatically
+- If it does not, download them beforehand on a connected machine ([https://download.pytorch.org/models/resnet50-11ad3fa6.pth](https://download.pytorch.org/models/resnet50-11ad3fa6.pth)) and copy the file to `~/.cache/torch/hub/checkpoints` on the training server
+
+## Training
+Set `--dataset_dir` in the training script to the absolute path of your `WFLW` dataset, e.g. `/home/wz/datasets/WFLW`
+### Single GPU
+Use the `train.py` script:
+```bash
+python train.py
+```
+### Multiple GPUs
+Use the `train_multi_GPU.py` script:
+```
+torchrun --nproc_per_node=8 train_multi_GPU.py
+```
+To restrict training to specific GPUs, prepend `CUDA_VISIBLE_DEVICES`, e.g.:
+```
+CUDA_VISIBLE_DEVICES=4,5,6,7 torchrun --nproc_per_node=4 train_multi_GPU.py
+```
+
+## Trained weights
+If you cannot train the model yourself or just want a quick try, you can use my trained weights (the file is somewhat large because it also stores the optimizer state). They reach an NME of `0.048` on the WFLW validation set. Baidu Netdisk: [https://pan.baidu.com/s/1L_zg-fmocEyzhSTxj8IDJw](https://pan.baidu.com/s/1L_zg-fmocEyzhSTxj8IDJw)
+Extraction code: 8fux
+
+After downloading, create a `weights` folder in this project and place the weights file inside it.
+
+## Testing an image
+See `predict.py`: set `img_path` to the face image you want to predict. Only single-face keypoint detection is supported, so provide an image of a single face; in practice, combine it with a face detector. Example input:
+
+*(image: `test_img.jpg`)*
+
+Visualization of the network prediction:
+
+*(image: `predict.jpg`)*
+
+## Exporting an ONNX model (optional)
+Use the `export_onnx.py` script to export an ONNX model.
\ No newline at end of file
diff --git a/pytorch_keypoint/DeepPose/datasets.py b/pytorch_keypoint/DeepPose/datasets.py
new file mode 100644
index 000000000..7e79cef12
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/datasets.py
@@ -0,0 +1,121 @@
+import os
+from typing import List, Tuple
+
+import cv2
+import torch
+import torch.utils.data as data
+import numpy as np
+
+
+class WFLWDataset(data.Dataset):
+ """
+ https://wywu.github.io/projects/LAB/WFLW.html
+
+ dataset structure:
+
+ ├── WFLW_annotations
+ │ ├── list_98pt_rect_attr_train_test
+ │ └── list_98pt_test
+ └── WFLW_images
+ ├── 0--Parade
+ ├── 1--Handshaking
+ ├── 10--People_Marching
+ ├── 11--Meeting
+ ├── 12--Group
+ └── ......
+ """
+ def __init__(self,
+ root: str,
+ train: bool = True,
+ transforms=None):
+ super().__init__()
+ self.img_root = os.path.join(root, "WFLW_images")
+ assert os.path.exists(self.img_root), "path '{}' does not exist.".format(self.img_root)
+ ana_txt_name = "list_98pt_rect_attr_train.txt" if train else "list_98pt_rect_attr_test.txt"
+ self.anno_path = os.path.join(root, "WFLW_annotations", "list_98pt_rect_attr_train_test", ana_txt_name)
+ assert os.path.exists(self.anno_path), "file '{}' does not exist.".format(self.anno_path)
+
+ self.transforms = transforms
+ self.keypoints: List[np.ndarray] = []
+ self.face_rects: List[List[int]] = []
+ self.img_paths: List[str] = []
+ with open(self.anno_path, "rt") as f:
+ for line in f.readlines():
+ if not line.strip():
+ continue
+
+ split_list = line.strip().split(" ")
+ keypoint_ = self.get_98_points(split_list)
+ keypoint = np.array(keypoint_, dtype=np.float32).reshape((-1, 2))
+ face_rect = list(map(int, split_list[196: 196 + 4])) # xmin, ymin, xmax, ymax
+ img_name = split_list[-1]
+
+ self.keypoints.append(keypoint)
+ self.face_rects.append(face_rect)
+ self.img_paths.append(os.path.join(self.img_root, img_name))
+
+ @staticmethod
+ def get_5_points(keypoints: List[str]) -> List[float]:
+ five_num = [76, 82, 54, 96, 97]
+ five_keypoint = []
+ for i in five_num:
+ five_keypoint.append(keypoints[i * 2])
+ five_keypoint.append(keypoints[i * 2 + 1])
+ return list(map(float, five_keypoint))
+
+ @staticmethod
+ def get_98_points(keypoints: List[str]) -> List[float]:
+ return list(map(float, keypoints[:196]))
+
+ @staticmethod
+ def collate_fn(batch_infos: List[Tuple[torch.Tensor, dict]]):
+ imgs, ori_keypoints, keypoints, m_invs = [], [], [], []
+ for info in batch_infos:
+ imgs.append(info[0])
+ ori_keypoints.append(info[1]["ori_keypoint"])
+ keypoints.append(info[1]["keypoint"])
+ m_invs.append(info[1]["m_inv"])
+
+ imgs_tensor = torch.stack(imgs)
+ keypoints_tensor = torch.stack(keypoints)
+ ori_keypoints_tensor = torch.stack(ori_keypoints)
+ m_invs_tensor = torch.stack(m_invs)
+
+ targets = {"ori_keypoints": ori_keypoints_tensor,
+ "keypoints": keypoints_tensor,
+ "m_invs": m_invs_tensor}
+ return imgs_tensor, targets
+
+ def __getitem__(self, idx: int):
+ img_bgr = cv2.imread(self.img_paths[idx], flags=cv2.IMREAD_COLOR)
+ img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+
+ target = {
+ "box": self.face_rects[idx],
+ "ori_keypoint": self.keypoints[idx],
+ "keypoint": self.keypoints[idx]
+ }
+
+ if self.transforms is not None:
+ img, target = self.transforms(img, target)
+
+ return img, target
+
+ def __len__(self):
+ return len(self.keypoints)
+
+
+if __name__ == '__main__':
+ train_dataset = WFLWDataset("/home/wz/datasets/WFLW", train=True)
+ print(len(train_dataset))
+
+ eval_dataset = WFLWDataset("/home/wz/datasets/WFLW", train=False)
+ print(len(eval_dataset))
+
+ from utils import draw_keypoints
+ img, target = train_dataset[0]
+ keypoint = target["keypoint"]
+ h, w, c = img.shape
+ keypoint[:, 0] /= w
+ keypoint[:, 1] /= h
+ draw_keypoints(img, keypoint, "test_plot.jpg", is_rel=True)
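Each line of `list_98pt_rect_attr_*.txt` is a flat, whitespace-separated record: 98 (x, y) coordinate pairs, a face rectangle, a few attribute flags, and the image path as the last field, which is exactly what `__init__` above slices apart. A sketch with a synthetic line (the values and the image name are made up):

```python
# Sketch: parse one (synthetic) WFLW annotation line the same way WFLWDataset does.
import numpy as np

fake_line = " ".join(["10.0"] * 196                       # 98 (x, y) keypoint pairs
                     + ["5", "5", "100", "120"]           # face rect: xmin ymin xmax ymax
                     + ["0"] * 6                          # attribute flags
                     + ["0--Parade/example.jpg"])         # image path (made up)
fields = fake_line.strip().split(" ")

keypoints = np.array(list(map(float, fields[:196])), dtype=np.float32).reshape((-1, 2))
face_rect = list(map(int, fields[196:196 + 4]))
img_name = fields[-1]
print(keypoints.shape, face_rect, img_name)   # (98, 2) [5, 5, 100, 120] 0--Parade/example.jpg
```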
diff --git a/pytorch_keypoint/DeepPose/export_onnx.py b/pytorch_keypoint/DeepPose/export_onnx.py
new file mode 100644
index 000000000..3d44dc37e
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/export_onnx.py
@@ -0,0 +1,29 @@
+import os
+import torch
+from model import create_deep_pose_model
+
+
+def main():
+ img_hw = [256, 256]
+ num_keypoints = 98
+ weights_path = "./weights/model_weights_209.pth"
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ # create model
+ model = create_deep_pose_model(num_keypoints=num_keypoints)
+
+ # load model weights
+ assert os.path.exists(weights_path), "file: '{}' does not exist.".format(weights_path)
+ model.load_state_dict(torch.load(weights_path, map_location="cpu")["model"])
+ model.to(device)
+
+ model.eval()
+ with torch.inference_mode():
+ x = torch.randn(size=(1, 3, img_hw[0], img_hw[1]), device=device)
+ torch.onnx.export(model=model,
+ args=(x,),
+ f="deeppose.onnx")
+
+
+if __name__ == '__main__':
+ main()
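To check the exported file, the sketch below runs it with onnxruntime (an optional dependency not listed in `requirements.txt`); the 256×256 input size and the rel-to-abs rescaling mirror `predict.py`:

```python
# Sketch (assumes the optional onnxruntime package): run deeppose.onnx on a
# dummy input and reshape the flat output back to 98 (x, y) pairs.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("deeppose.onnx", providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name

dummy = np.random.randn(1, 3, 256, 256).astype(np.float32)
(pred,) = sess.run(None, {input_name: dummy})                                   # [1, 196]
keypoints = pred.reshape(-1, 2) * np.array([256.0, 256.0], dtype=np.float32)    # rel -> abs in the 256x256 crop
print(keypoints.shape)  # (98, 2)
```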
diff --git a/pytorch_keypoint/DeepPose/model.py b/pytorch_keypoint/DeepPose/model.py
new file mode 100644
index 000000000..1d5abdfb2
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/model.py
@@ -0,0 +1,21 @@
+import torch
+import torch.nn as nn
+from torchvision.models import resnet50, ResNet50_Weights
+
+
+def create_deep_pose_model(num_keypoints: int) -> nn.Module:
+ res50 = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
+ in_features = res50.fc.in_features
+ res50.fc = nn.Linear(in_features=in_features, out_features=num_keypoints * 2)
+
+ return res50
+
+
+if __name__ == '__main__':
+ torch.manual_seed(1234)
+ model = create_deep_pose_model(98)
+ model.eval()
+ with torch.inference_mode():
+ x = torch.randn(1, 3, 224, 224)
+ res = model(x)
+ print(res.shape)
diff --git a/pytorch_keypoint/DeepPose/predict.jpg b/pytorch_keypoint/DeepPose/predict.jpg
new file mode 100644
index 000000000..2107a2fe7
Binary files /dev/null and b/pytorch_keypoint/DeepPose/predict.jpg differ
diff --git a/pytorch_keypoint/DeepPose/predict.py b/pytorch_keypoint/DeepPose/predict.py
new file mode 100644
index 000000000..a12a60d6c
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/predict.py
@@ -0,0 +1,55 @@
+import os
+
+import torch
+import numpy as np
+from PIL import Image
+
+import transforms
+from model import create_deep_pose_model
+from utils import draw_keypoints
+
+
+def main():
+ img_hw = [256, 256]
+ num_keypoints = 98
+ img_path = "./test_img.jpg"
+ weights_path = "./weights/model_weights_209.pth"
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ transform = transforms.Compose([
+ transforms.AffineTransform(scale_prob=0., rotate_prob=0., shift_prob=0., fixed_size=img_hw),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+
+ # load image
+ assert os.path.exists(img_path), "file: '{}' does not exist.".format(img_path)
+ img = np.array(Image.open(img_path))
+ h, w, c = img.shape
+ target = {"box": [0, 0, w, h]}
+ img_tensor, target = transform(img, target=target)
+ # expand batch dimension
+ img_tensor = img_tensor.unsqueeze(0)
+
+ # create model
+ model = create_deep_pose_model(num_keypoints=num_keypoints)
+
+ # load model weights
+ assert os.path.exists(weights_path), "file: '{}' does not exist.".format(weights_path)
+ model.load_state_dict(torch.load(weights_path, map_location="cpu")["model"])
+ model.to(device)
+
+ # prediction
+ model.eval()
+ with torch.inference_mode():
+ with torch.autocast(device_type=device.type):
+ pred = torch.squeeze(model(img_tensor.to(device))).reshape([-1, 2]).cpu().numpy()
+
+ wh_tensor = np.array(img_hw[::-1], dtype=np.float32).reshape([1, 2])
+ pred = pred * wh_tensor # rel coord to abs coord
+ pred = transforms.affine_points_np(pred, target["m_inv"].numpy())
+ draw_keypoints(img, coordinate=pred, save_path="predict.jpg", radius=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/pytorch_keypoint/DeepPose/requirements.txt b/pytorch_keypoint/DeepPose/requirements.txt
new file mode 100644
index 000000000..385ffc3f2
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/requirements.txt
@@ -0,0 +1,5 @@
+torch>=2.0.1
+torchvision>=0.15.2
+opencv-python
+tqdm
+tensorboard
\ No newline at end of file
diff --git a/pytorch_keypoint/DeepPose/test_img.jpg b/pytorch_keypoint/DeepPose/test_img.jpg
new file mode 100644
index 000000000..6388b49ed
Binary files /dev/null and b/pytorch_keypoint/DeepPose/test_img.jpg differ
diff --git a/pytorch_keypoint/DeepPose/train.py b/pytorch_keypoint/DeepPose/train.py
new file mode 100644
index 000000000..4d8e108f6
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/train.py
@@ -0,0 +1,176 @@
+import os
+
+import torch
+import torch.amp
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+
+import transforms
+from model import create_deep_pose_model
+from datasets import WFLWDataset
+from train_utils.train_eval_utils import train_one_epoch, evaluate
+
+
+def get_args_parser(add_help=True):
+ import argparse
+
+ parser = argparse.ArgumentParser(description="PyTorch DeepPose Training", add_help=add_help)
+ parser.add_argument("--dataset_dir", type=str, default="/home/wz/datasets/WFLW", help="WFLW dataset directory")
+ parser.add_argument("--device", type=str, default="cuda:0", help="training device, e.g. cpu, cuda:0")
+ parser.add_argument("--save_weights_dir", type=str, default="./weights", help="save dir for model weights")
+ parser.add_argument("--save_freq", type=int, default=10, help="save frequency for weights and generated imgs")
+ parser.add_argument("--eval_freq", type=int, default=5, help="evaluate frequency")
+ parser.add_argument('--img_hw', default=[256, 256], nargs='+', type=int, help='training image size[h, w]')
+ parser.add_argument("--epochs", type=int, default=210, help="number of epochs of training")
+ parser.add_argument("--batch_size", type=int, default=32, help="size of the batches")
+ parser.add_argument("--num_workers", type=int, default=8, help="number of workers, default: 8")
+ parser.add_argument("--num_keypoints", type=int, default=98, help="number of keypoints")
+ parser.add_argument("--lr", type=float, default=5e-4, help="Adam: learning rate")
+ parser.add_argument('--lr_steps', default=[170, 200], nargs='+', type=int,
+ help='decrease lr every step-size epochs')
+ parser.add_argument("--warmup_epoch", type=int, default=10, help="number of warmup epoch for training")
+ parser.add_argument('--resume', default='', type=str, help='resume from checkpoint')
+ parser.add_argument('--test_only', action="store_true", help='Only test the model')
+
+ return parser
+
+
+def main(args):
+ torch.manual_seed(1234)
+ dataset_dir = args.dataset_dir
+ save_weights_dir = args.save_weights_dir
+ save_freq = args.save_freq
+ eval_freq = args.eval_freq
+ num_keypoints = args.num_keypoints
+ num_workers = args.num_workers
+ epochs = args.epochs
+ bs = args.batch_size
+ start_epoch = 0
+ img_hw = args.img_hw
+ os.makedirs(save_weights_dir, exist_ok=True)
+
+ if "cuda" in args.device and not torch.cuda.is_available():
+ device = torch.device("cpu")
+ else:
+ device = torch.device(args.device)
+ print(f"using device: {device} for training.")
+
+ # tensorboard writer
+ tb_writer = SummaryWriter()
+
+ # create model
+ model = create_deep_pose_model(num_keypoints)
+ model.to(device)
+
+ # config dataset and dataloader
+ data_transform = {
+ "train": transforms.Compose([
+ transforms.AffineTransform(scale_factor=(0.65, 1.35), rotate=45, shift_factor=0.15, fixed_size=img_hw),
+ transforms.RandomHorizontalFlip(0.5),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ]),
+ "val": transforms.Compose([
+ transforms.AffineTransform(scale_prob=0., rotate_prob=0., shift_prob=0., fixed_size=img_hw),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+ }
+ train_dataset = WFLWDataset(root=dataset_dir,
+ train=True,
+ transforms=data_transform["train"])
+ val_dataset = WFLWDataset(root=dataset_dir,
+ train=False,
+ transforms=data_transform["val"])
+
+ train_loader = DataLoader(train_dataset,
+ batch_size=bs,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=num_workers,
+ collate_fn=WFLWDataset.collate_fn,
+ persistent_workers=True)
+
+ val_loader = DataLoader(val_dataset,
+ batch_size=bs,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=num_workers,
+ collate_fn=WFLWDataset.collate_fn,
+ persistent_workers=True)
+
+ # define optimizers
+ optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
+
+ # define learning rate scheduler
+ warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
+ optimizer=optimizer,
+ start_factor=0.01,
+ end_factor=1.0,
+ total_iters=len(train_loader) * args.warmup_epoch
+ )
+ multi_step_scheduler = torch.optim.lr_scheduler.MultiStepLR(
+ optimizer=optimizer,
+ milestones=[len(train_loader) * i for i in args.lr_steps],
+ gamma=0.1
+ )
+
+ lr_scheduler = torch.optim.lr_scheduler.ChainedScheduler([warmup_scheduler, multi_step_scheduler])
+
+ if args.resume:
+ assert os.path.exists(args.resume)
+ checkpoint = torch.load(args.resume, map_location='cpu')
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ start_epoch = checkpoint['epoch'] + 1
+ print("the training process from epoch{}...".format(start_epoch))
+
+ if args.test_only:
+ evaluate(model=model,
+ epoch=start_epoch,
+ val_loader=val_loader,
+ device=device,
+ tb_writer=tb_writer,
+ affine_points_torch_func=transforms.affine_points_torch,
+ num_keypoints=num_keypoints,
+ img_hw=img_hw)
+ return
+
+ for epoch in range(start_epoch, epochs):
+ # train
+ train_one_epoch(model=model,
+ epoch=epoch,
+ train_loader=train_loader,
+ device=device,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ tb_writer=tb_writer,
+ num_keypoints=num_keypoints,
+ img_hw=img_hw)
+
+ # eval
+ if epoch % eval_freq == 0 or epoch == args.epochs - 1:
+ evaluate(model=model,
+ epoch=epoch,
+ val_loader=val_loader,
+ device=device,
+ tb_writer=tb_writer,
+ affine_points_torch_func=transforms.affine_points_torch,
+ num_keypoints=num_keypoints,
+ img_hw=img_hw)
+
+ # save weights
+ if epoch % save_freq == 0 or epoch == args.epochs - 1:
+ save_files = {
+ 'model': model.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'epoch': epoch
+ }
+ torch.save(save_files, os.path.join(save_weights_dir, f"model_weights_{epoch}.pth"))
+
+
+if __name__ == '__main__':
+ args = get_args_parser().parse_args()
+ main(args)
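Both schedulers above are parameterised in iterations (`len(train_loader) * ...`), so the chained scheduler is intended to be stepped once per batch inside `train_one_epoch`. The sketch below reproduces the resulting schedule with a made-up `iters_per_epoch`:

```python
# Sketch of the schedule configured in main(): per-iteration linear warmup
# chained with a per-iteration multi-step decay. iters_per_epoch is made up.
import torch

iters_per_epoch = 100          # in train.py this is len(train_loader)
warmup_epoch, lr_steps, epochs = 10, [170, 200], 210

model = torch.nn.Linear(2, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
warmup = torch.optim.lr_scheduler.LinearLR(
    optimizer, start_factor=0.01, end_factor=1.0, total_iters=iters_per_epoch * warmup_epoch)
decay = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[iters_per_epoch * i for i in lr_steps], gamma=0.1)
lr_scheduler = torch.optim.lr_scheduler.ChainedScheduler([warmup, decay])

for it in range(epochs * iters_per_epoch):
    optimizer.step()        # dummy training iteration
    lr_scheduler.step()     # stepped every iteration, matching the iteration-based milestones
    if it % (50 * iters_per_epoch) == 0:
        print(f"iter {it}: lr = {optimizer.param_groups[0]['lr']:.2e}")
```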
diff --git a/pytorch_keypoint/DeepPose/train_multi_GPU.py b/pytorch_keypoint/DeepPose/train_multi_GPU.py
new file mode 100644
index 000000000..d1d1c2f9a
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/train_multi_GPU.py
@@ -0,0 +1,188 @@
+import os
+
+import torch
+import torch.amp
+from torch.utils.data import DataLoader, DistributedSampler, BatchSampler
+from torch.utils.tensorboard import SummaryWriter
+
+import transforms
+from model import create_deep_pose_model
+from datasets import WFLWDataset
+from train_utils.train_eval_utils import train_one_epoch, evaluate
+from train_utils.distributed_utils import init_distributed_mode, is_main_process
+
+
+def get_args_parser(add_help=True):
+ import argparse
+
+ parser = argparse.ArgumentParser(description="PyTorch DeepPose Training", add_help=add_help)
+ parser.add_argument("--dataset_dir", type=str, default="/home/wz/datasets/WFLW", help="WFLW dataset directory")
+ parser.add_argument("--device", type=str, default="cuda", help="training device, e.g. cpu, cuda")
+ parser.add_argument("--save_weights_dir", type=str, default="./weights", help="save dir for model weights")
+ parser.add_argument("--save_freq", type=int, default=5, help="save frequency for weights and generated imgs")
+ parser.add_argument("--eval_freq", type=int, default=5, help="evaluate frequency")
+ parser.add_argument('--img_hw', default=[256, 256], nargs='+', type=int, help='training image size[h, w]')
+ parser.add_argument("--epochs", type=int, default=210, help="number of epochs of training")
+ parser.add_argument("--batch_size", type=int, default=32, help="size of the batches")
+ parser.add_argument("--num_workers", type=int, default=8, help="number of workers, default: 8")
+ parser.add_argument("--num_keypoints", type=int, default=98, help="number of keypoints")
+ parser.add_argument("--lr", type=float, default=5e-4, help="Adam: learning rate")
+ parser.add_argument('--lr_steps', default=[170, 200], nargs='+', type=int,
+ help='decrease lr every step-size epochs')
+ parser.add_argument("--warmup_epoch", type=int, default=10, help="number of warmup epoch for training")
+ parser.add_argument('--resume', default='', type=str, help='resume from checkpoint')
+ parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
+ parser.add_argument('--test_only', action="store_true", help='Only test the model')
+
+ return parser
+
+
+def main(args):
+ torch.manual_seed(1234)
+ init_distributed_mode(args)
+ if not args.distributed:
+ raise EnvironmentError("distributed mode is required; launch this script with torchrun.")
+
+ dataset_dir = args.dataset_dir
+ save_weights_dir = args.save_weights_dir
+ save_freq = args.save_freq
+ eval_freq = args.eval_freq
+ num_keypoints = args.num_keypoints
+ num_workers = args.num_workers
+ epochs = args.epochs
+ bs = args.batch_size
+ start_epoch = 0
+ img_hw = args.img_hw
+ device = torch.device(args.device)
+ os.makedirs(save_weights_dir, exist_ok=True)
+
+ # adjust learning rate
+ args.lr = args.lr * args.world_size
+
+ tb_writer = None
+ if is_main_process():
+ # tensorboard writer
+ tb_writer = SummaryWriter()
+
+ # create model
+ model = create_deep_pose_model(num_keypoints)
+ model.to(device)
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
+
+ # config dataset and dataloader
+ data_transform = {
+ "train": transforms.Compose([
+ transforms.AffineTransform(scale_factor=(0.65, 1.35), rotate=45, shift_factor=0.15, fixed_size=img_hw),
+ transforms.RandomHorizontalFlip(0.5),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ]),
+ "val": transforms.Compose([
+ transforms.AffineTransform(scale_prob=0., rotate_prob=0., shift_prob=0., fixed_size=img_hw),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+ }
+ train_dataset = WFLWDataset(root=dataset_dir,
+ train=True,
+ transforms=data_transform["train"])
+ val_dataset = WFLWDataset(root=dataset_dir,
+ train=False,
+ transforms=data_transform["val"])
+
+ train_sampler = DistributedSampler(train_dataset)
+ val_sampler = DistributedSampler(val_dataset)
+ train_batch_sampler = BatchSampler(train_sampler, args.batch_size, drop_last=True)
+
+ train_loader = DataLoader(train_dataset,
+ batch_sampler=train_batch_sampler,
+ pin_memory=True,
+ num_workers=num_workers,
+ collate_fn=WFLWDataset.collate_fn,
+ persistent_workers=True)
+
+ val_loader = DataLoader(val_dataset,
+ batch_size=bs,
+ sampler=val_sampler,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=num_workers,
+ collate_fn=WFLWDataset.collate_fn,
+ persistent_workers=True)
+
+ # define optimizers
+ optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
+
+ # define learning rate scheduler
+ warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
+ optimizer=optimizer,
+ start_factor=0.01,
+ end_factor=1.0,
+ total_iters=len(train_loader) * args.warmup_epoch
+ )
+ multi_step_scheduler = torch.optim.lr_scheduler.MultiStepLR(
+ optimizer=optimizer,
+ milestones=[len(train_loader) * i for i in args.lr_steps],
+ gamma=0.1
+ )
+
+ lr_scheduler = torch.optim.lr_scheduler.ChainedScheduler([warmup_scheduler, multi_step_scheduler])
+
+ if args.resume:
+ assert os.path.exists(args.resume)
+ checkpoint = torch.load(args.resume, map_location='cpu')
+ model.module.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ start_epoch = checkpoint['epoch'] + 1
+ print("the training process from epoch{}...".format(start_epoch))
+
+ if args.test_only:
+ evaluate(model=model,
+ epoch=start_epoch,
+ val_loader=val_loader,
+ device=device,
+ tb_writer=tb_writer,
+ affine_points_torch_func=transforms.affine_points_torch,
+ num_keypoints=num_keypoints,
+ img_hw=img_hw)
+ return
+
+ for epoch in range(start_epoch, epochs):
+ # train
+ train_sampler.set_epoch(epoch) # shuffle training data
+ train_one_epoch(model=model,
+ epoch=epoch,
+ train_loader=train_loader,
+ device=device,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ tb_writer=tb_writer,
+ num_keypoints=num_keypoints,
+ img_hw=img_hw)
+
+ # eval
+ if epoch % eval_freq == 0 or epoch == args.epochs - 1:
+ evaluate(model=model,
+ epoch=epoch,
+ val_loader=val_loader,
+ device=device,
+ tb_writer=tb_writer,
+ affine_points_torch_func=transforms.affine_points_torch,
+ num_keypoints=num_keypoints,
+ img_hw=img_hw)
+
+ # save weights
+ if is_main_process() and (epoch % save_freq == 0 or epoch == args.epochs - 1):
+ save_files = {
+ 'model': model.module.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'epoch': epoch
+ }
+ torch.save(save_files, os.path.join(save_weights_dir, f"model_weights_{epoch}.pth"))
+
+
+if __name__ == '__main__':
+ args = get_args_parser().parse_args()
+ main(args)
diff --git a/pytorch_keypoint/DeepPose/train_utils/distributed_utils.py b/pytorch_keypoint/DeepPose/train_utils/distributed_utils.py
new file mode 100644
index 000000000..ef3cdef66
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/train_utils/distributed_utils.py
@@ -0,0 +1,95 @@
+import os
+
+import torch
+import torch.distributed as dist
+
+
+def reduce_value(input_value: torch.Tensor, average=True) -> torch.Tensor:
+ """
+ Args:
+ input_value (Tensor): all the values will be reduced
+ average (bool): whether to do average or sum
+ Reduce the values from all processes so that all processes
+ have the averaged results.
+ """
+ world_size = get_world_size()
+ if world_size < 2:  # single process / single GPU
+ return input_value
+
+ with torch.inference_mode():  # multi-GPU: all-reduce across processes
+ dist.all_reduce(input_value)
+ if average:
+ input_value /= world_size
+
+ return input_value
+
+
+def setup_for_distributed(is_master):
+ """
+ This function disables printing when not in the master process
+ """
+ import builtins as __builtin__
+ builtin_print = __builtin__.print
+
+ def print(*args, **kwargs):
+ force = kwargs.pop('force', False)
+ if is_master or force:
+ builtin_print(*args, **kwargs)
+
+ __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+ """检查是否支持分布式环境"""
+ if not dist.is_available():
+ return False
+ if not dist.is_initialized():
+ return False
+ return True
+
+
+def get_world_size():
+ if not is_dist_avail_and_initialized():
+ return 1
+ return dist.get_world_size()
+
+
+def get_rank():
+ if not is_dist_avail_and_initialized():
+ return 0
+ return dist.get_rank()
+
+
+def is_main_process():
+ return get_rank() == 0
+
+
+def init_distributed_mode(args):
+ if not torch.cuda.is_available():
+ print('No available device')
+ args.distributed = False
+ return
+
+ if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+ args.rank = int(os.environ["RANK"])
+ args.world_size = int(os.environ['WORLD_SIZE'])
+ args.gpu = int(os.environ['LOCAL_RANK'])
+ elif 'SLURM_PROCID' in os.environ:
+ args.rank = int(os.environ['SLURM_PROCID'])
+ args.gpu = args.rank % torch.cuda.device_count()
+ else:
+ print('Not using distributed mode')
+ args.distributed = False
+ return
+
+ args.distributed = True
+
+ torch.cuda.set_device(args.gpu)
+ args.dist_backend = 'nccl'
+ print(f'| distributed init (rank {args.rank}): {args.dist_url}', flush=True)
+ torch.distributed.init_process_group(backend=args.dist_backend,
+ init_method=args.dist_url,
+ world_size=args.world_size,
+ rank=args.rank)
+ torch.distributed.barrier()
+ setup_for_distributed(args.rank == 0)
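`reduce_value` degrades to a no-op when a single process is running, so metric and logging code can call it unconditionally. A sketch (assuming the module is importable as `train_utils.distributed_utils`) that also runs without `torchrun`:

```python
# Sketch: reduce_value is safe to call in both single- and multi-process runs.
import torch
from train_utils.distributed_utils import get_world_size, reduce_value

loss = torch.tensor(0.5)
print(get_world_size())                    # 1 when launched without torchrun
print(reduce_value(loss, average=True))    # unchanged here; averaged across ranks under torchrun
```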
diff --git a/pytorch_keypoint/DeepPose/train_utils/losses.py b/pytorch_keypoint/DeepPose/train_utils/losses.py
new file mode 100644
index 000000000..163a93af2
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/train_utils/losses.py
@@ -0,0 +1,128 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class L1Loss(nn.Module):
+ def __init__(self) -> None:
+ super().__init__()
+
+ def forward(self, pred: torch.Tensor, label: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
+ """
+ Args:
+ pred [N, K, 2]
+ label [N, K, 2]
+ mask [N, K]
+ """
+ losses = F.l1_loss(pred, label, reduction="none")
+ if mask is not None:
+ # filter invalid keypoints(e.g. out of range)
+ losses = losses * mask.unsqueeze(2)
+
+ return torch.mean(torch.sum(losses, dim=(1, 2)), dim=0)
+
+
+class SmoothL1Loss(nn.Module):
+ def __init__(self) -> None:
+ super().__init__()
+
+ def forward(self, pred: torch.Tensor, label: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
+ """
+ Args:
+ pred [N, K, 2]
+ label [N, K, 2]
+ mask [N, K]
+ """
+ losses = F.smooth_l1_loss(pred, label, reduction="none")
+ if mask is not None:
+ # filter invalid keypoints(e.g. out of range)
+ losses = losses * mask.unsqueeze(2)
+
+ return torch.mean(torch.sum(losses, dim=(1, 2)), dim=0)
+
+
+class L2Loss(nn.Module):
+ def __init__(self) -> None:
+ super().__init__()
+
+ def forward(self, pred: torch.Tensor, label: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
+ """
+ Args:
+ pred [N, K, 2]
+ label [N, K, 2]
+ mask [N, K]
+ """
+ losses = F.mse_loss(pred, label, reduction="none")
+ if mask is not None:
+ # filter invalid keypoints(e.g. out of range)
+ losses = losses * mask.unsqueeze(2)
+
+ return torch.mean(torch.sum(losses, dim=(1, 2)), dim=0)
+
+
+class WingLoss(nn.Module):
+ """refer https://github.com/TropComplique/wing-loss/blob/master/loss.py
+ """
+ def __init__(self, w: float = 10.0, epsilon: float = 2.0) -> None:
+ super().__init__()
+ self.w = w
+ self.epsilon = epsilon
+ self.C = w * (1.0 - math.log(1.0 + w / epsilon))
+
+ def forward(self,
+ pred: torch.Tensor,
+ label: torch.Tensor,
+ wh_tensor: torch.Tensor,
+ mask: torch.Tensor = None) -> torch.Tensor:
+ """
+ Args:
+ pred [N, K, 2]
+ wh_tensor [1, 1, 2]
+ label [N, K, 2]
+ mask [N, K]
+ """
+ delta = (pred - label).abs() * wh_tensor # rel to abs
+ losses = torch.where(condition=self.w > delta,
+ input=self.w * torch.log(1.0 + delta / self.epsilon),
+ other=delta - self.C)
+ if mask is not None:
+ # filter invalid keypoints(e.g. out of range)
+ losses = losses * mask.unsqueeze(2)
+
+ return torch.mean(torch.sum(losses, dim=(1, 2)), dim=0)
+
+
+class SoftWingLoss(nn.Module):
+ """refer mmpose/models/losses/regression_loss.py
+ """
+ def __init__(self, omega1: float = 2.0, omega2: float = 20.0, epsilon: float = 0.5) -> None:
+ super().__init__()
+ self.omega1 = omega1
+ self.omega2 = omega2
+ self.epsilon = epsilon
+ self.B = omega1 - omega2 * math.log(1.0 + omega1 / epsilon)
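+        # Soft wing loss, applied element-wise to the error x:
+        #   loss(x) = x                                    if x < omega1
+        #           = omega2 * ln(1 + x / epsilon) + B     otherwise
+        # with B = omega1 - omega2 * ln(1 + omega1 / epsilon) so the curve is continuous at omega1.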
+
+ def forward(self,
+ pred: torch.Tensor,
+ label: torch.Tensor,
+ wh_tensor: torch.Tensor,
+                mask: torch.Tensor = None) -> torch.Tensor:
+ """
+ Args:
+ pred [N, K, 2]
+ label [N, K, 2]
+ wh_tensor [1, 1, 2]
+ mask [N, K]
+ """
+ delta = (pred - label).abs() * wh_tensor # rel to abs
+ losses = torch.where(condition=delta < self.omega1,
+ input=delta,
+ other=self.omega2 * torch.log(1.0 + delta / self.epsilon) + self.B)
+ if mask is not None:
+ # filter invalid keypoints(e.g. out of range)
+ losses = losses * mask.unsqueeze(2)
+
+ loss = torch.mean(torch.sum(losses, dim=(1, 2)), dim=0)
+ return loss
diff --git a/pytorch_keypoint/DeepPose/train_utils/metrics.py b/pytorch_keypoint/DeepPose/train_utils/metrics.py
new file mode 100644
index 000000000..b0f0c7ce3
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/train_utils/metrics.py
@@ -0,0 +1,61 @@
+import torch
+
+from .distributed_utils import reduce_value, is_dist_avail_and_initialized
+
+
+class NMEMetric:
+ def __init__(self, device: torch.device) -> None:
+        # keypoint indices of the two outer eye corners (WFLW)
+ self.keypoint_idxs = [60, 72]
+ self.nme_accumulator: float = 0.
+ self.counter: float = 0.
+ self.device = device
+
+ def update(self, pred: torch.Tensor, gt: torch.Tensor, mask: torch.Tensor = None):
+ """
+ Args:
+ pred (shape [N, K, 2]): pred keypoints
+ gt (shape [N, K, 2]): gt keypoints
+ mask (shape [N, K]): valid keypoints mask
+ """
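+        # NME: for every valid sample, average the per-keypoint L2 error, divide it by the
+        # inter-ocular distance (outer eye corners), then average the ratio over samples.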
+ # ion: inter-ocular distance normalized error
+ ion = torch.linalg.norm(gt[:, self.keypoint_idxs[0]] - gt[:, self.keypoint_idxs[1]], dim=1)
+
+ valid_ion_mask = ion > 0
+ if mask is None:
+ mask = valid_ion_mask
+ else:
+ mask = torch.logical_and(mask, valid_ion_mask.unsqueeze_(dim=1)).sum(dim=1) > 0
+ num_valid = mask.sum().item()
+
+ # equal: (pred - gt).pow(2).sum(dim=2).pow(0.5).mean(dim=1)
+ l2_dis = torch.linalg.norm(pred - gt, dim=2)[mask].mean(dim=1) # [N]
+
+ # avoid divide by zero
+ ion = ion[mask] # [N]
+
+ self.nme_accumulator += l2_dis.div(ion).sum().item()
+ self.counter += num_valid
+
+ def evaluate(self):
+ return self.nme_accumulator / self.counter
+
+ def synchronize_results(self):
+ if is_dist_avail_and_initialized():
+ self.nme_accumulator = reduce_value(
+ torch.as_tensor(self.nme_accumulator, device=self.device),
+ average=False
+ ).item()
+
+            self.counter = reduce_value(
+                torch.as_tensor(self.counter, device=self.device),
+                average=False
+            ).item()
+
+
+if __name__ == '__main__':
+    # note: because of the relative imports above, run this check as a module,
+    # e.g. `python -m train_utils.metrics` from the DeepPose directory
+    metric = NMEMetric(device=torch.device("cpu"))
+    metric.update(pred=torch.randn(32, 98, 2),
+                  gt=torch.randn(32, 98, 2),
+                  mask=torch.randint(0, 2, (32, 98)).bool())
+    print(metric.evaluate())
diff --git a/pytorch_keypoint/DeepPose/train_utils/train_eval_utils.py b/pytorch_keypoint/DeepPose/train_utils/train_eval_utils.py
new file mode 100644
index 000000000..bba484af5
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/train_utils/train_eval_utils.py
@@ -0,0 +1,92 @@
+import sys
+import math
+from typing import Callable, List
+
+from tqdm import tqdm
+import torch
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+
+from .losses import WingLoss
+from .metrics import NMEMetric
+from .distributed_utils import is_main_process, reduce_value
+
+
+def train_one_epoch(model: torch.nn.Module,
+ epoch: int,
+ train_loader: DataLoader,
+ device: torch.device,
+ optimizer: torch.optim.Optimizer,
+ lr_scheduler: torch.optim.lr_scheduler.LRScheduler,
+ tb_writer: SummaryWriter,
+ num_keypoints: int,
+ img_hw: List[int]) -> None:
+ # define loss function
+ loss_func = WingLoss()
+ wh_tensor = torch.as_tensor(img_hw[::-1], dtype=torch.float32, device=device).reshape([1, 1, 2])
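+    # img_hw is (h, w); reversing it gives (w, h), so multiplying the model's relative
+    # [x, y] outputs by wh_tensor converts them to absolute pixel coordinates inside WingLoss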
+
+ model.train()
+ train_bar = train_loader
+ if is_main_process():
+ train_bar = tqdm(train_loader, file=sys.stdout)
+
+ for step, (imgs, targets) in enumerate(train_bar):
+ imgs = imgs.to(device)
+ labels = targets["keypoints"].to(device)
+
+ optimizer.zero_grad()
+ # use mixed precision to speed up training
+ with torch.autocast(device_type=device.type):
+ pred: torch.Tensor = model(imgs)
+ loss: torch.Tensor = loss_func(pred.reshape((-1, num_keypoints, 2)), labels, wh_tensor)
+
+ loss_value = reduce_value(loss).item()
+ if not math.isfinite(loss_value):
+ print("Loss is {}, stopping training".format(loss_value))
+ sys.exit(1)
+
+ loss.backward()
+ optimizer.step()
+ lr_scheduler.step()
+
+ if is_main_process():
+ train_bar.desc = f"train epoch[{epoch}] loss:{loss_value:.3f}"
+
+ global_step = epoch * len(train_loader) + step
+ tb_writer.add_scalar("train loss", loss.item(), global_step=global_step)
+ tb_writer.add_scalar("learning rate", optimizer.param_groups[0]["lr"], global_step=global_step)
+
+
+@torch.inference_mode()
+def evaluate(model: torch.nn.Module,
+ epoch: int,
+ val_loader: DataLoader,
+ device: torch.device,
+ tb_writer: SummaryWriter,
+ affine_points_torch_func: Callable,
+ num_keypoints: int,
+ img_hw: List[int]) -> None:
+ model.eval()
+ metric = NMEMetric(device=device)
+ wh_tensor = torch.as_tensor(img_hw[::-1], dtype=torch.float32, device=device).reshape([1, 1, 2])
+ eval_bar = val_loader
+ if is_main_process():
+ eval_bar = tqdm(val_loader, file=sys.stdout, desc="evaluation")
+
+ for step, (imgs, targets) in enumerate(eval_bar):
+ imgs = imgs.to(device)
+ m_invs = targets["m_invs"].to(device)
+ labels = targets["ori_keypoints"].to(device)
+
+ pred = model(imgs)
+ pred = pred.reshape((-1, num_keypoints, 2)) # [N, K, 2]
+ pred = pred * wh_tensor # rel coord to abs coord
+ pred = affine_points_torch_func(pred, m_invs)
+
+ metric.update(pred, labels)
+
+ metric.synchronize_results()
+ if is_main_process():
+ nme = metric.evaluate()
+ tb_writer.add_scalar("evaluation nme", nme, global_step=epoch)
+ print(f"evaluation NME[{epoch}]: {nme:.3f}")
diff --git a/pytorch_keypoint/DeepPose/transforms.py b/pytorch_keypoint/DeepPose/transforms.py
new file mode 100644
index 000000000..ea55d25fb
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/transforms.py
@@ -0,0 +1,217 @@
+import math
+import random
+from typing import Tuple
+
+import cv2
+import torch
+import numpy as np
+
+from wflw_horizontal_flip_indices import wflw_flip_indices_dict
+
+
+def adjust_box(xmin: int, ymin: int, xmax: int, ymax: int, fixed_size: Tuple[int, int]):
+    """Fix the aspect ratio of the input crop by enlarging either w or h."""
+ w = xmax - xmin
+ h = ymax - ymin
+
+ hw_ratio = fixed_size[0] / fixed_size[1]
+ if h / w > hw_ratio:
+        # pad along the w direction
+ wi = h / hw_ratio
+ pad_w = (wi - w) / 2
+ xmin = xmin - pad_w
+ xmax = xmax + pad_w
+ else:
+        # pad along the h direction
+ hi = w * hw_ratio
+ pad_h = (hi - h) / 2
+ ymin = ymin - pad_h
+ ymax = ymax + pad_h
+
+ return xmin, ymin, xmax, ymax
+
+
+def affine_points_np(keypoint: np.ndarray, m: np.ndarray) -> np.ndarray:
+ """
+ Args:
+ keypoint [k, 2]
+ m [2, 3]
+ """
+ ones = np.ones((keypoint.shape[0], 1), dtype=np.float32)
+ keypoint = np.concatenate([keypoint, ones], axis=1) # [k, 3]
+ new_keypoint = np.matmul(keypoint, m.T)
+ return new_keypoint
+
+
+def affine_points_torch(keypoint: torch.Tensor, m: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ keypoint [n, k, 2]
+ m [n, 2, 3]
+ """
+ dtype = keypoint.dtype
+ device = keypoint.device
+
+ n, k, _ = keypoint.shape
+ ones = torch.ones(size=(n, k, 1), dtype=dtype, device=device)
+ keypoint = torch.concat([keypoint, ones], dim=2) # [n, k, 3]
+ new_keypoint = torch.matmul(keypoint, m.transpose(1, 2))
+ return new_keypoint
+
+
+class Compose(object):
+    """Compose several transform functions."""
+ def __init__(self, transforms):
+ self.transforms = transforms
+
+ def __call__(self, image, target):
+ for t in self.transforms:
+ image, target = t(image, target)
+ return image, target
+
+
+class Resize(object):
+ def __init__(self, h: int, w: int):
+ self.h = h
+ self.w = w
+
+ def __call__(self, image: np.ndarray, target):
+ image = cv2.resize(image, dsize=(self.w, self.h), fx=0, fy=0,
+ interpolation=cv2.INTER_LINEAR)
+
+ return image, target
+
+
+class ToTensor(object):
+    """Convert an OpenCV image to a Tensor (HWC -> CHW) and scale values to 0~1."""
+ def __call__(self, image, target):
+ image = torch.from_numpy(image).permute((2, 0, 1))
+ image = image.to(torch.float32) / 255.
+
+ if "ori_keypoint" in target and "keypoint" in target:
+ target["ori_keypoint"] = torch.from_numpy(target["ori_keypoint"])
+ target["keypoint"] = torch.from_numpy(target["keypoint"])
+ target["m_inv"] = torch.from_numpy(target["m_inv"])
+ return image, target
+
+
+class Normalize(object):
+ def __init__(self, mean=None, std=None):
+ self.mean = torch.as_tensor(mean, dtype=torch.float32).reshape((3, 1, 1))
+ self.std = torch.as_tensor(std, dtype=torch.float32).reshape((3, 1, 1))
+
+ def __call__(self, image: torch.Tensor, target: dict):
+ image.sub_(self.mean).div_(self.std)
+
+ if "keypoint" in target:
+ _, h, w = image.shape
+ keypoint = target["keypoint"]
+ keypoint[:, 0] /= w
+ keypoint[:, 1] /= h
+ target["keypoint"] = keypoint
+ return image, target
+
+
+class RandomHorizontalFlip(object):
+    """Randomly flip the input image horizontally."""
+ def __init__(self, p: float = 0.5):
+ self.p = p
+ self.wflw_flip_ids = list(wflw_flip_indices_dict.values())
+
+ def __call__(self, image: np.ndarray, target: dict):
+ if random.random() < self.p:
+ # [h, w, c]
+ image = np.ascontiguousarray(np.flip(image, axis=[1]))
+
+ # [k, 2]
+ if "keypoint" in target:
+ _, w, _ = image.shape
+                keypoint: np.ndarray = target["keypoint"]
+ keypoint = keypoint[self.wflw_flip_ids]
+ keypoint[:, 0] = w - keypoint[:, 0]
+ target["keypoint"] = keypoint
+
+ return image, target
+
+
+class AffineTransform(object):
+ """shift+scale+rotation"""
+ def __init__(self,
+ scale_factor: Tuple[float, float] = (0.65, 1.35),
+ scale_prob: float = 1.,
+ rotate: int = 45,
+ rotate_prob: float = 0.6,
+ shift_factor: float = 0.15,
+ shift_prob: float = 0.3,
+ fixed_size: Tuple[int, int] = (256, 256)):
+ self.scale_factor = scale_factor
+ self.scale_prob = scale_prob
+ self.rotate = rotate
+ self.rotate_prob = rotate_prob
+ self.shift_factor = shift_factor
+ self.shift_prob = shift_prob
+ self.fixed_size = fixed_size # (h, w)
+
+ def __call__(self, img: np.ndarray, target: dict):
+ src_xmin, src_ymin, src_xmax, src_ymax = adjust_box(*target["box"], fixed_size=self.fixed_size)
+ src_w = src_xmax - src_xmin
+ src_h = src_ymax - src_ymin
+
+ if random.random() < self.shift_prob:
+ shift_w_factor = random.uniform(-self.shift_factor, self.shift_factor)
+ shift_h_factor = random.uniform(-self.shift_factor, self.shift_factor)
+ src_xmin -= int(src_w * shift_w_factor)
+ src_xmax -= int(src_w * shift_w_factor)
+ src_ymin -= int(src_h * shift_h_factor)
+ src_ymax -= int(src_h * shift_h_factor)
+
+ src_center = np.array([(src_xmin + src_xmax) / 2, (src_ymin + src_ymax) / 2], dtype=np.float32)
+ src_p2 = src_center + np.array([0, -src_h / 2], dtype=np.float32) # top middle
+ src_p3 = src_center + np.array([src_w / 2, 0], dtype=np.float32) # right middle
+
+ dst_center = np.array([(self.fixed_size[1] - 1) / 2, (self.fixed_size[0] - 1) / 2], dtype=np.float32)
+ dst_p2 = np.array([(self.fixed_size[1] - 1) / 2, 0], dtype=np.float32) # top middle
+ dst_p3 = np.array([self.fixed_size[1] - 1, (self.fixed_size[0] - 1) / 2], dtype=np.float32) # right middle
+
+ if random.random() < self.scale_prob:
+ scale = random.uniform(*self.scale_factor)
+ src_w = src_w * scale
+ src_h = src_h * scale
+ src_p2 = src_center + np.array([0, -src_h / 2], dtype=np.float32) # top middle
+ src_p3 = src_center + np.array([src_w / 2, 0], dtype=np.float32) # right middle
+
+ if random.random() < self.rotate_prob:
+            angle = random.randint(-self.rotate, self.rotate)  # in degrees
+            angle = angle / 180 * math.pi  # convert to radians
+ src_p2 = src_center + np.array([src_h / 2 * math.sin(angle),
+ -src_h / 2 * math.cos(angle)], dtype=np.float32)
+ src_p3 = src_center + np.array([src_w / 2 * math.cos(angle),
+ src_w / 2 * math.sin(angle)], dtype=np.float32)
+
+ src = np.stack([src_center, src_p2, src_p3])
+ dst = np.stack([dst_center, dst_p2, dst_p3])
+
+        m = cv2.getAffineTransform(src, dst).astype(np.float32)  # forward affine transform matrix
+        m_inv = cv2.getAffineTransform(dst, src).astype(np.float32)  # inverse affine matrix, used later to map predictions back
+
+        # apply the affine transform to the image
+ warp_img = cv2.warpAffine(src=img,
+ M=m,
+ dsize=tuple(self.fixed_size[::-1]), # [w, h]
+ borderMode=cv2.BORDER_CONSTANT,
+ borderValue=(0, 0, 0),
+ flags=cv2.INTER_LINEAR)
+
+ if "keypoint" in target:
+ keypoint = target["keypoint"]
+ keypoint = affine_points_np(keypoint, m)
+ target["keypoint"] = keypoint
+
+ # from utils import draw_keypoints
+ # keypoint[:, 0] /= self.fixed_size[1]
+ # keypoint[:, 1] /= self.fixed_size[0]
+ # draw_keypoints(warp_img, keypoint, "affine.jpg", 2, is_rel=True)
+
+ target["m"] = m
+ target["m_inv"] = m_inv
+ return warp_img, target
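+
+
+if __name__ == '__main__':
+    # Minimal round-trip sketch (not part of the original file; shapes and values are made up):
+    # keypoints mapped with the forward matrix `m` and then with `m_inv` should land back
+    # where they started, up to floating point error.
+    pts = np.random.rand(98, 2).astype(np.float32) * 200.0
+    dummy_target = {"box": [0, 0, 255, 255], "keypoint": pts.copy()}
+    dummy_img = np.zeros((256, 256, 3), dtype=np.uint8)
+    transform = AffineTransform(scale_prob=0., rotate_prob=0., shift_prob=0.)
+    warp_img, dummy_target = transform(dummy_img, dummy_target)
+    restored = affine_points_np(dummy_target["keypoint"], dummy_target["m_inv"])
+    print("max round-trip error:", np.abs(restored - pts).max())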
diff --git a/pytorch_keypoint/DeepPose/utils.py b/pytorch_keypoint/DeepPose/utils.py
new file mode 100644
index 000000000..e848022c0
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/utils.py
@@ -0,0 +1,17 @@
+import cv2
+import numpy as np
+
+
+def draw_keypoints(img: np.ndarray, coordinate: np.ndarray, save_path: str, radius: int = 3, is_rel: bool = False):
+ coordinate_ = coordinate.copy()
+ if is_rel:
+ h, w, c = img.shape
+ coordinate_[:, 0] *= w
+ coordinate_[:, 1] *= h
+ coordinate_ = coordinate_.astype(np.int64).tolist()
+
+ img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+ for x, y in coordinate_:
+ cv2.circle(img_bgr, center=(x, y), radius=radius, color=(255, 0, 0), thickness=-1)
+
+ cv2.imwrite(save_path, img_bgr)
diff --git a/pytorch_keypoint/DeepPose/wflw_horizontal_flip_indices.py b/pytorch_keypoint/DeepPose/wflw_horizontal_flip_indices.py
new file mode 100644
index 000000000..de0151319
--- /dev/null
+++ b/pytorch_keypoint/DeepPose/wflw_horizontal_flip_indices.py
@@ -0,0 +1,100 @@
+wflw_flip_indices_dict = {
+ 0: 32,
+ 1: 31,
+ 2: 30,
+ 3: 29,
+ 4: 28,
+ 5: 27,
+ 6: 26,
+ 7: 25,
+ 8: 24,
+ 9: 23,
+ 10: 22,
+ 11: 21,
+ 12: 20,
+ 13: 19,
+ 14: 18,
+ 15: 17,
+ 16: 16,
+ 17: 15,
+ 18: 14,
+ 19: 13,
+ 20: 12,
+ 21: 11,
+ 22: 10,
+ 23: 9,
+ 24: 8,
+ 25: 7,
+ 26: 6,
+ 27: 5,
+ 28: 4,
+ 29: 3,
+ 30: 2,
+ 31: 1,
+ 32: 0,
+ 33: 46,
+ 34: 45,
+ 35: 44,
+ 36: 43,
+ 37: 42,
+ 38: 50,
+ 39: 49,
+ 40: 48,
+ 41: 47,
+ 42: 37,
+ 43: 36,
+ 44: 35,
+ 45: 34,
+ 46: 33,
+ 47: 41,
+ 48: 40,
+ 49: 39,
+ 50: 38,
+ 51: 51,
+ 52: 52,
+ 53: 53,
+ 54: 54,
+ 55: 59,
+ 56: 58,
+ 57: 57,
+ 58: 56,
+ 59: 55,
+ 60: 72,
+ 61: 71,
+ 62: 70,
+ 63: 69,
+ 64: 68,
+ 65: 75,
+ 66: 74,
+ 67: 73,
+ 68: 64,
+ 69: 63,
+ 70: 62,
+ 71: 61,
+ 72: 60,
+ 73: 67,
+ 74: 66,
+ 75: 65,
+ 76: 82,
+ 77: 81,
+ 78: 80,
+ 79: 79,
+ 80: 78,
+ 81: 77,
+ 82: 76,
+ 83: 87,
+ 84: 86,
+ 85: 85,
+ 86: 84,
+ 87: 83,
+ 88: 92,
+ 89: 91,
+ 90: 90,
+ 91: 89,
+ 92: 88,
+ 93: 95,
+ 94: 94,
+ 95: 93,
+ 96: 97,
+ 97: 96,
+}
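+
+
+if __name__ == '__main__':
+    # Sanity-check sketch (not part of the original file): a horizontal-flip index map
+    # must be an involution, i.e. flipping an index twice returns the original index.
+    assert all(wflw_flip_indices_dict[v] == k for k, v in wflw_flip_indices_dict.items())
+    print(f"valid flip mapping over {len(wflw_flip_indices_dict)} WFLW keypoints")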
diff --git a/pytorch_keypoint/HRNet/HRNet.png b/pytorch_keypoint/HRNet/HRNet.png
new file mode 100644
index 000000000..96e83b8b5
Binary files /dev/null and b/pytorch_keypoint/HRNet/HRNet.png differ
diff --git a/pytorch_keypoint/HRNet/README.md b/pytorch_keypoint/HRNet/README.md
new file mode 100644
index 000000000..509f1097f
--- /dev/null
+++ b/pytorch_keypoint/HRNet/README.md
@@ -0,0 +1,105 @@
+# HRNet
+
+## This project is mainly based on the following repositories
+* https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
+* https://github.com/stefanopini/simple-HRNet
+
+## Environment
+* Python 3.6/3.7/3.8
+* PyTorch 1.10 or later
+* pycocotools (Linux: `pip install pycocotools`; Windows: `pip install pycocotools-windows`, no extra Visual Studio installation required)
+* Ubuntu or CentOS (Windows is not recommended)
+* Training on GPU is strongly recommended
+* See `requirements.txt` for the detailed environment
+
+## Repository structure
+```
+  ├── model: HRNet model definition
+  ├── train_utils: training/validation utilities (including COCO evaluation)
+  ├── my_dataset_coco.py: custom dataset for reading COCO2017
+  ├── person_keypoints.json: person keypoint meta information for the COCO dataset
+  ├── train.py: training script for a single GPU/CPU
+  ├── train_multi_GPU.py: training script for multi-GPU users
+  ├── predict.py: simple prediction script using trained weights
+  ├── validation.py: computes COCO metrics on validation/test data with trained weights and writes record_mAP.txt
+  └── transforms.py: data augmentation
+```
+
+## Pretrained weights (download them into this folder)
+The pretrained weights provided by the original authors (ImageNet and COCO) are hosted on Google Drive and OneDrive, which can be hard to access from mainland China, so all of them have been re-uploaded to Baidu Netdisk.
+Download link: https://pan.baidu.com/s/1Lu6mMAWfm_8GGykttFMpVw  extraction code: f43o
+
+The downloaded archive is organized as follows:
+```
+├── pytorch
+ ├── pose_mpii
+ ├── pose_coco
+ │ ├── pose_resnet_50_384x288.pth
+ │ ├── pose_resnet_50_256x192.pth
+ │ ├── pose_resnet_101_384x288.pth
+ │ ├── pose_resnet_101_256x192.pth
+ │ ├── pose_hrnet_w32_384x288.pth
+ │ └── pose_hrnet_w32_256x192.pth
+ └── imagenet
+ ├── resnet50-19c8e357.pth
+ ├── resnet152-b121ed2d.pth
+ ├── resnet101-5d3b4d8f.pth
+ └── hrnet_w32-36af842e.pth
+```
+To run prediction directly with weights pretrained on COCO, download `pose_hrnet_w32_256x192.pth` from the pose_coco folder.
+To train the network from scratch, download `hrnet_w32-36af842e.pth` from the imagenet folder and rename it to `hrnet_w32.pth`.
+
+There is also a `person_detection_results` file that stores the person detector results mentioned in the paper. Download it if you need it, but using the GT boxes from COCO val is recommended instead.
+Link: https://pan.baidu.com/s/19Z4mmNHUD934GQ9QYcF5iw  extraction code: i08q
+
+## Dataset (this project uses COCO2017)
+* COCO official website: https://cocodataset.org/
+* If you are not familiar with the dataset, see my blog post: https://blog.csdn.net/qq_37541097/article/details/113247318
+* Taking coco2017 as an example, three files need to be downloaded:
+  * `2017 Train images [118K/18GB]`: all images used during training
+  * `2017 Val images [5K/1GB]`: all images used during validation
+  * `2017 Train/Val annotations [241MB]`: annotation json files for the train and val sets
+* Extract everything into a `coco2017` folder, which gives the following structure:
+```
+├── coco2017: dataset root
+    ├── train2017: all training images (118287 images)
+    ├── val2017: all validation images (5000 images)
+    └── annotations: annotation files
+        ├── instances_train2017.json: training annotations for detection/segmentation
+        ├── instances_val2017.json: validation annotations for detection/segmentation
+        ├── captions_train2017.json: training annotations for image captioning
+        ├── captions_val2017.json: validation annotations for image captioning
+        ├── person_keypoints_train2017.json: training annotations for person keypoint detection
+        └── person_keypoints_val2017.json: validation annotations for person keypoint detection
+```
+
+## Training
+* Note: training HRNet from scratch with this project reaches 76.1 mAP[@0.50:0.95] on the MS COCO2017 val set, while the weights provided by the original authors reach 76.6, a gap of 0.5 points whose cause has not been found yet. Training takes 210 epochs (following the paper) and is therefore very long, so using the authors' pretrained weights is recommended. Also, GPU utilization during training is fairly low (fluctuating between 20% and 60%), presumably due to the network architecture.
+* Prepare the dataset in advance
+* Download the corresponding pretrained weights in advance
+* Set `--num-joints` (number of person keypoints, 17 for COCO), `--fixed-size` (height and width of the input crop, default [256, 192]) and `--data-path` (path to the `coco2017` directory)
+* For single-GPU training, use the train.py script directly
+* For multi-GPU training, use `torchrun --nproc_per_node=8 train_multi_GPU.py`, where `nproc_per_node` is the number of GPUs
+* To select specific GPUs, prefix the command with `CUDA_VISIBLE_DEVICES=0,3` (e.g. to use only the 1st and 4th GPU)
+* `CUDA_VISIBLE_DEVICES=0,3 torchrun --nproc_per_node=2 train_multi_GPU.py`
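+
+A fuller multi-GPU launch might look like the following (data path, GPU ids and batch size are placeholders, adjust them to your setup):
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 train_multi_GPU.py --data-path /data/coco2017 --batch-size 32 --amp
+```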
+
+## Notes
+1. When using the training scripts, make sure `--data-path` points to the **root directory** of your dataset:
+assuming the COCO dataset is used and has been extracted to /data/coco2017, the custom CocoKeypoint dataset will read it from there
+```
+python train.py --data-path /data/coco2017
+```
+2. The `results.txt` saved during training records the COCO metrics on the validation set for each epoch: the first 10 values are the COCO metrics, followed by the mean training loss and the learning rate
+3. When using the prediction script, set `weights_path` to the path of your own trained weights if you want to load them.
+
+
+## If you are not familiar with the HRNet network, see my bilibili video
+https://www.bilibili.com/video/BV1bB4y1y7qP
+
+## For a walkthrough of this project and an analysis of the HRNet code, see my bilibili video
+https://www.bilibili.com/video/BV1ar4y157JM
+
+## HRNet architecture
+![HRNet](HRNet.png)
diff --git a/pytorch_keypoint/HRNet/draw_utils.py b/pytorch_keypoint/HRNet/draw_utils.py
new file mode 100644
index 000000000..dbaddd579
--- /dev/null
+++ b/pytorch_keypoint/HRNet/draw_utils.py
@@ -0,0 +1,58 @@
+import numpy as np
+from numpy import ndarray
+import PIL
+from PIL import ImageDraw, ImageFont
+from PIL.Image import Image
+
+# COCO 17 points
+point_name = ["nose", "left_eye", "right_eye",
+ "left_ear", "right_ear",
+ "left_shoulder", "right_shoulder",
+ "left_elbow", "right_elbow",
+ "left_wrist", "right_wrist",
+ "left_hip", "right_hip",
+ "left_knee", "right_knee",
+ "left_ankle", "right_ankle"]
+
+point_color = [(240, 2, 127), (240, 2, 127), (240, 2, 127),
+ (240, 2, 127), (240, 2, 127),
+ (255, 255, 51), (255, 255, 51),
+ (254, 153, 41), (44, 127, 184),
+ (217, 95, 14), (0, 0, 255),
+ (255, 255, 51), (255, 255, 51), (228, 26, 28),
+ (49, 163, 84), (252, 176, 243), (0, 176, 240),
+ (255, 255, 0), (169, 209, 142),
+ (255, 255, 0), (169, 209, 142),
+ (255, 255, 0), (169, 209, 142)]
+
+
+def draw_keypoints(img: Image,
+ keypoints: ndarray,
+ scores: ndarray = None,
+ thresh: float = 0.2,
+ r: int = 2,
+ draw_text: bool = False,
+ font: str = 'arial.ttf',
+ font_size: int = 10):
+ if isinstance(img, ndarray):
+ img = PIL.Image.fromarray(img)
+
+ if scores is None:
+ scores = np.ones(keypoints.shape[0])
+
+ if draw_text:
+ try:
+ font = ImageFont.truetype(font, font_size)
+ except IOError:
+ font = ImageFont.load_default()
+
+ draw = ImageDraw.Draw(img)
+ for i, (point, score) in enumerate(zip(keypoints, scores)):
+ if score > thresh and np.max(point) > 0:
+ draw.ellipse([point[0] - r, point[1] - r, point[0] + r, point[1] + r],
+ fill=point_color[i],
+ outline=(255, 255, 255))
+ if draw_text:
+ draw.text((point[0] + r, point[1] + r), text=point_name[i], font=font)
+
+ return img
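+
+
+if __name__ == '__main__':
+    # Minimal usage sketch (not part of the original file): draw 17 dummy keypoints
+    # on a blank image and save the result.
+    dummy_img = PIL.Image.new("RGB", (220, 220), (255, 255, 255))
+    dummy_pts = np.array([[20.0 + 10.0 * i, 110.0] for i in range(17)])
+    draw_keypoints(dummy_img, dummy_pts, draw_text=True).save("keypoints_demo.jpg")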
diff --git a/pytorch_keypoint/HRNet/model/__init__.py b/pytorch_keypoint/HRNet/model/__init__.py
new file mode 100644
index 000000000..8db7aa786
--- /dev/null
+++ b/pytorch_keypoint/HRNet/model/__init__.py
@@ -0,0 +1 @@
+from .hrnet import HighResolutionNet
diff --git a/pytorch_keypoint/HRNet/model/hrnet.py b/pytorch_keypoint/HRNet/model/hrnet.py
new file mode 100644
index 000000000..4524aa693
--- /dev/null
+++ b/pytorch_keypoint/HRNet/model/hrnet.py
@@ -0,0 +1,278 @@
+import torch.nn as nn
+
+BN_MOMENTUM = 0.1
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
+ super(BasicBlock, self).__init__()
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
+ super(Bottleneck, self).__init__()
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+ padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
+ bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion,
+ momentum=BN_MOMENTUM)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class StageModule(nn.Module):
+ def __init__(self, input_branches, output_branches, c):
+ """
+        Builds one stage, i.e. the module that fuses features of different scales.
+        :param input_branches: number of input branches, one per scale
+        :param output_branches: number of output branches
+        :param c: number of channels of the first (highest-resolution) input branch
+ """
+ super().__init__()
+ self.input_branches = input_branches
+ self.output_branches = output_branches
+
+ self.branches = nn.ModuleList()
+        for i in range(self.input_branches):  # each branch first goes through 4 BasicBlocks
+            w = c * (2 ** i)  # number of channels of the i-th branch
+ branch = nn.Sequential(
+ BasicBlock(w, w),
+ BasicBlock(w, w),
+ BasicBlock(w, w),
+ BasicBlock(w, w)
+ )
+ self.branches.append(branch)
+
+        self.fuse_layers = nn.ModuleList()  # used to fuse the outputs of all branches
+ for i in range(self.output_branches):
+ self.fuse_layers.append(nn.ModuleList())
+ for j in range(self.input_branches):
+ if i == j:
+                    # same input and output branch: no transformation needed
+ self.fuse_layers[-1].append(nn.Identity())
+ elif i < j:
+                    # input branch j has a higher downsampling rate than output branch i,
+                    # so adjust its channels and upsample it before the later element-wise add
+ self.fuse_layers[-1].append(
+ nn.Sequential(
+ nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=1, stride=1, bias=False),
+ nn.BatchNorm2d(c * (2 ** i), momentum=BN_MOMENTUM),
+ nn.Upsample(scale_factor=2.0 ** (j - i), mode='nearest')
+ )
+ )
+ else: # i > j
+                    # input branch j has a lower downsampling rate than output branch i,
+                    # so adjust its channels and downsample it before the later element-wise add
+                    # note: each 2x downsampling is one 3x3 conv, 4x needs two, 8x needs three, i-j in total
+ ops = []
+                    # the first i-j-1 convs only downsample, without changing the channel count
+ for k in range(i - j - 1):
+ ops.append(
+ nn.Sequential(
+ nn.Conv2d(c * (2 ** j), c * (2 ** j), kernel_size=3, stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(c * (2 ** j), momentum=BN_MOMENTUM),
+ nn.ReLU(inplace=True)
+ )
+ )
+                    # the last conv both adjusts the channels and downsamples
+ ops.append(
+ nn.Sequential(
+ nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=3, stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(c * (2 ** i), momentum=BN_MOMENTUM)
+ )
+ )
+ self.fuse_layers[-1].append(nn.Sequential(*ops))
+
+ self.relu = nn.ReLU(inplace=True)
+
+ def forward(self, x):
+        # pass each branch through its own blocks
+ x = [branch(xi) for branch, xi in zip(self.branches, x)]
+
+        # then fuse information across scales
+ x_fused = []
+ for i in range(len(self.fuse_layers)):
+ x_fused.append(
+ self.relu(
+ sum([self.fuse_layers[i][j](x[j]) for j in range(len(self.branches))])
+ )
+ )
+
+ return x_fused
+
+
+class HighResolutionNet(nn.Module):
+ def __init__(self, base_channel: int = 32, num_joints: int = 17):
+ super().__init__()
+ # Stem
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+ self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+ self.relu = nn.ReLU(inplace=True)
+
+ # Stage1
+ downsample = nn.Sequential(
+ nn.Conv2d(64, 256, kernel_size=1, stride=1, bias=False),
+ nn.BatchNorm2d(256, momentum=BN_MOMENTUM)
+ )
+ self.layer1 = nn.Sequential(
+ Bottleneck(64, 64, downsample=downsample),
+ Bottleneck(256, 64),
+ Bottleneck(256, 64),
+ Bottleneck(256, 64)
+ )
+
+ self.transition1 = nn.ModuleList([
+ nn.Sequential(
+ nn.Conv2d(256, base_channel, kernel_size=3, stride=1, padding=1, bias=False),
+ nn.BatchNorm2d(base_channel, momentum=BN_MOMENTUM),
+ nn.ReLU(inplace=True)
+ ),
+ nn.Sequential(
+            nn.Sequential(  # nested Sequential kept so the state_dict keys match the weights from the original project
+ nn.Conv2d(256, base_channel * 2, kernel_size=3, stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(base_channel * 2, momentum=BN_MOMENTUM),
+ nn.ReLU(inplace=True)
+ )
+ )
+ ])
+
+ # Stage2
+ self.stage2 = nn.Sequential(
+ StageModule(input_branches=2, output_branches=2, c=base_channel)
+ )
+
+ # transition2
+ self.transition2 = nn.ModuleList([
+ nn.Identity(), # None, - Used in place of "None" because it is callable
+ nn.Identity(), # None, - Used in place of "None" because it is callable
+ nn.Sequential(
+ nn.Sequential(
+ nn.Conv2d(base_channel * 2, base_channel * 4, kernel_size=3, stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(base_channel * 4, momentum=BN_MOMENTUM),
+ nn.ReLU(inplace=True)
+ )
+ )
+ ])
+
+ # Stage3
+ self.stage3 = nn.Sequential(
+ StageModule(input_branches=3, output_branches=3, c=base_channel),
+ StageModule(input_branches=3, output_branches=3, c=base_channel),
+ StageModule(input_branches=3, output_branches=3, c=base_channel),
+ StageModule(input_branches=3, output_branches=3, c=base_channel)
+ )
+
+ # transition3
+ self.transition3 = nn.ModuleList([
+ nn.Identity(), # None, - Used in place of "None" because it is callable
+ nn.Identity(), # None, - Used in place of "None" because it is callable
+ nn.Identity(), # None, - Used in place of "None" because it is callable
+ nn.Sequential(
+ nn.Sequential(
+ nn.Conv2d(base_channel * 4, base_channel * 8, kernel_size=3, stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(base_channel * 8, momentum=BN_MOMENTUM),
+ nn.ReLU(inplace=True)
+ )
+ )
+ ])
+
+ # Stage4
+        # note: the last StageModule only outputs the highest-resolution feature map
+ self.stage4 = nn.Sequential(
+ StageModule(input_branches=4, output_branches=4, c=base_channel),
+ StageModule(input_branches=4, output_branches=4, c=base_channel),
+ StageModule(input_branches=4, output_branches=1, c=base_channel)
+ )
+
+ # Final layer
+ self.final_layer = nn.Conv2d(base_channel, num_joints, kernel_size=1, stride=1)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.conv2(x)
+ x = self.bn2(x)
+ x = self.relu(x)
+
+ x = self.layer1(x)
+ x = [trans(x) for trans in self.transition1] # Since now, x is a list
+
+ x = self.stage2(x)
+ x = [
+ self.transition2[0](x[0]),
+ self.transition2[1](x[1]),
+ self.transition2[2](x[-1])
+ ] # New branch derives from the "upper" branch only
+
+ x = self.stage3(x)
+ x = [
+ self.transition3[0](x[0]),
+ self.transition3[1](x[1]),
+ self.transition3[2](x[2]),
+ self.transition3[3](x[-1]),
+ ] # New branch derives from the "upper" branch only
+
+ x = self.stage4(x)
+
+ x = self.final_layer(x[0])
+
+ return x
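+
+
+if __name__ == '__main__':
+    # Quick shape-check sketch (not part of the original file). The stem downsamples the
+    # input by 4x, so a 256x192 crop should give a [N, num_joints, 64, 48] heatmap tensor.
+    import torch
+    net = HighResolutionNet(base_channel=32, num_joints=17)
+    out = net(torch.randn(1, 3, 256, 192))
+    print(out.shape)  # expected: torch.Size([1, 17, 64, 48])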
diff --git a/pytorch_keypoint/HRNet/my_dataset_coco.py b/pytorch_keypoint/HRNet/my_dataset_coco.py
new file mode 100644
index 000000000..ff1cea78a
--- /dev/null
+++ b/pytorch_keypoint/HRNet/my_dataset_coco.py
@@ -0,0 +1,108 @@
+import os
+import copy
+
+import torch
+import numpy as np
+import cv2
+import torch.utils.data as data
+from pycocotools.coco import COCO
+
+
+class CocoKeypoint(data.Dataset):
+ def __init__(self,
+ root,
+ dataset="train",
+ years="2017",
+ transforms=None,
+ det_json_path=None,
+ fixed_size=(256, 192)):
+ super().__init__()
+ assert dataset in ["train", "val"], 'dataset must be in ["train", "val"]'
+ anno_file = f"person_keypoints_{dataset}{years}.json"
+ assert os.path.exists(root), "file '{}' does not exist.".format(root)
+ self.img_root = os.path.join(root, f"{dataset}{years}")
+ assert os.path.exists(self.img_root), "path '{}' does not exist.".format(self.img_root)
+ self.anno_path = os.path.join(root, "annotations", anno_file)
+ assert os.path.exists(self.anno_path), "file '{}' does not exist.".format(self.anno_path)
+
+ self.fixed_size = fixed_size
+ self.mode = dataset
+ self.transforms = transforms
+ self.coco = COCO(self.anno_path)
+ img_ids = list(sorted(self.coco.imgs.keys()))
+
+ if det_json_path is not None:
+ det = self.coco.loadRes(det_json_path)
+ else:
+ det = self.coco
+
+ self.valid_person_list = []
+ obj_idx = 0
+ for img_id in img_ids:
+ img_info = self.coco.loadImgs(img_id)[0]
+ ann_ids = det.getAnnIds(imgIds=img_id)
+ anns = det.loadAnns(ann_ids)
+ for ann in anns:
+ # only save person class
+ if ann["category_id"] != 1:
+ print(f'warning: find not support id: {ann["category_id"]}, only support id: 1 (person)')
+ continue
+
+                # COCO_val2017_detections_AP_H_56_person.json only contains detections, no keypoints, so skip these checks
+ if det_json_path is None:
+ # skip objs without keypoints annotation
+ if "keypoints" not in ann:
+ continue
+ if max(ann["keypoints"]) == 0:
+ continue
+
+ xmin, ymin, w, h = ann['bbox']
+ # Use only valid bounding boxes
+ if w > 0 and h > 0:
+ info = {
+ "box": [xmin, ymin, w, h],
+ "image_path": os.path.join(self.img_root, img_info["file_name"]),
+ "image_id": img_id,
+ "image_width": img_info['width'],
+ "image_height": img_info['height'],
+ "obj_origin_hw": [h, w],
+ "obj_index": obj_idx,
+ "score": ann["score"] if "score" in ann else 1.
+ }
+
+                    # COCO_val2017_detections_AP_H_56_person.json only contains detections, no keypoints, so skip
+ if det_json_path is None:
+ keypoints = np.array(ann["keypoints"]).reshape([-1, 3])
+ visible = keypoints[:, 2]
+ keypoints = keypoints[:, :2]
+ info["keypoints"] = keypoints
+ info["visible"] = visible
+
+ self.valid_person_list.append(info)
+ obj_idx += 1
+
+ def __getitem__(self, idx):
+ target = copy.deepcopy(self.valid_person_list[idx])
+
+ image = cv2.imread(target["image_path"])
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ if self.transforms is not None:
+            image, target = self.transforms(image, target)
+
+ return image, target
+
+ def __len__(self):
+ return len(self.valid_person_list)
+
+ @staticmethod
+ def collate_fn(batch):
+ imgs_tuple, targets_tuple = tuple(zip(*batch))
+ imgs_tensor = torch.stack(imgs_tuple)
+ return imgs_tensor, targets_tuple
+
+
+if __name__ == '__main__':
+ train = CocoKeypoint("/data/coco2017/", dataset="val")
+ print(len(train))
+ t = train[0]
+ print(t)
diff --git a/pytorch_keypoint/HRNet/person.png b/pytorch_keypoint/HRNet/person.png
new file mode 100644
index 000000000..f647848b5
Binary files /dev/null and b/pytorch_keypoint/HRNet/person.png differ
diff --git a/pytorch_keypoint/HRNet/person_keypoints.json b/pytorch_keypoint/HRNet/person_keypoints.json
new file mode 100644
index 000000000..ffdbc5bd8
--- /dev/null
+++ b/pytorch_keypoint/HRNet/person_keypoints.json
@@ -0,0 +1,8 @@
+{
+ "keypoints": ["nose","left_eye","right_eye","left_ear","right_ear","left_shoulder","right_shoulder","left_elbow","right_elbow","left_wrist","right_wrist","left_hip","right_hip","left_knee","right_knee","left_ankle","right_ankle"],
+ "skeleton": [[16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13],[6,7],[6,8],[7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]],
+ "flip_pairs": [[1,2], [3,4], [5,6], [7,8], [9,10], [11,12], [13,14], [15,16]],
+ "kps_weights": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.2, 1.2, 1.5, 1.5, 1.0, 1.0, 1.2, 1.2, 1.5, 1.5],
+ "upper_body_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+ "lower_body_ids": [11, 12, 13, 14, 15, 16]
+}
\ No newline at end of file
diff --git a/pytorch_keypoint/HRNet/plot_curve.py b/pytorch_keypoint/HRNet/plot_curve.py
new file mode 100644
index 000000000..188df710e
--- /dev/null
+++ b/pytorch_keypoint/HRNet/plot_curve.py
@@ -0,0 +1,46 @@
+import datetime
+import matplotlib.pyplot as plt
+
+
+def plot_loss_and_lr(train_loss, learning_rate):
+ try:
+ x = list(range(len(train_loss)))
+ fig, ax1 = plt.subplots(1, 1)
+ ax1.plot(x, train_loss, 'r', label='loss')
+ ax1.set_xlabel("step")
+ ax1.set_ylabel("loss")
+ ax1.set_title("Train Loss and lr")
+ plt.legend(loc='best')
+
+ ax2 = ax1.twinx()
+ ax2.plot(x, learning_rate, label='lr')
+ ax2.set_ylabel("learning rate")
+        ax2.set_xlim(0, len(train_loss))  # set the x-axis range
+ plt.legend(loc='best')
+
+ handles1, labels1 = ax1.get_legend_handles_labels()
+ handles2, labels2 = ax2.get_legend_handles_labels()
+ plt.legend(handles1 + handles2, labels1 + labels2, loc='upper right')
+
+        fig.subplots_adjust(right=0.8)  # leave room on the right so the saved figure is not cut off
+ fig.savefig('./loss_and_lr{}.png'.format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))
+ plt.close()
+        print("successfully saved loss curve!")
+ except Exception as e:
+ print(e)
+
+
+def plot_map(mAP):
+ try:
+ x = list(range(len(mAP)))
+        plt.plot(x, mAP, label='mAP')
+ plt.xlabel('epoch')
+ plt.ylabel('mAP')
+ plt.title('Eval mAP')
+ plt.xlim(0, len(mAP))
+ plt.legend(loc='best')
+ plt.savefig('./mAP.png')
+ plt.close()
+        print("successfully saved mAP curve!")
+ except Exception as e:
+ print(e)
diff --git a/pytorch_keypoint/HRNet/predict.py b/pytorch_keypoint/HRNet/predict.py
new file mode 100644
index 000000000..ffb46a24c
--- /dev/null
+++ b/pytorch_keypoint/HRNet/predict.py
@@ -0,0 +1,82 @@
+import os
+import json
+
+import torch
+import numpy as np
+import cv2
+import matplotlib.pyplot as plt
+
+from model import HighResolutionNet
+from draw_utils import draw_keypoints
+import transforms
+
+
+def predict_all_person():
+ # TODO
+ pass
+
+
+def predict_single_person():
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ print(f"using device: {device}")
+
+ flip_test = True
+ resize_hw = (256, 192)
+ img_path = "./person.png"
+ weights_path = "./pose_hrnet_w32_256x192.pth"
+ keypoint_json_path = "person_keypoints.json"
+ assert os.path.exists(img_path), f"file: {img_path} does not exist."
+ assert os.path.exists(weights_path), f"file: {weights_path} does not exist."
+ assert os.path.exists(keypoint_json_path), f"file: {keypoint_json_path} does not exist."
+
+ data_transform = transforms.Compose([
+ transforms.AffineTransform(scale=(1.25, 1.25), fixed_size=resize_hw),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+
+ # read json file
+ with open(keypoint_json_path, "r") as f:
+ person_info = json.load(f)
+
+ # read single-person image
+ img = cv2.imread(img_path)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+ img_tensor, target = data_transform(img, {"box": [0, 0, img.shape[1] - 1, img.shape[0] - 1]})
+ img_tensor = torch.unsqueeze(img_tensor, dim=0)
+
+ # create model
+ # HRNet-W32: base_channel=32
+ # HRNet-W48: base_channel=48
+ model = HighResolutionNet(base_channel=32)
+ weights = torch.load(weights_path, map_location=device)
+ weights = weights if "model" not in weights else weights["model"]
+ model.load_state_dict(weights)
+ model.to(device)
+ model.eval()
+
+ with torch.inference_mode():
+ outputs = model(img_tensor.to(device))
+
+ if flip_test:
+ flip_tensor = transforms.flip_images(img_tensor)
+ flip_outputs = torch.squeeze(
+ transforms.flip_back(model(flip_tensor.to(device)), person_info["flip_pairs"]),
+ )
+ # feature is not aligned, shift flipped heatmap for higher accuracy
+ # https://github.com/leoxiaobin/deep-high-resolution-net.pytorch/issues/22
+ flip_outputs[..., 1:] = flip_outputs.clone()[..., 0: -1]
+ outputs = (outputs + flip_outputs) * 0.5
+
+ keypoints, scores = transforms.get_final_preds(outputs, [target["reverse_trans"]], True)
+ keypoints = np.squeeze(keypoints)
+ scores = np.squeeze(scores)
+
+ plot_img = draw_keypoints(img, keypoints, scores, thresh=0.2, r=3)
+ plt.imshow(plot_img)
+ plt.show()
+ plot_img.save("test_result.jpg")
+
+
+if __name__ == '__main__':
+ predict_single_person()
diff --git a/pytorch_keypoint/HRNet/requirements.txt b/pytorch_keypoint/HRNet/requirements.txt
new file mode 100644
index 000000000..d57b6b410
--- /dev/null
+++ b/pytorch_keypoint/HRNet/requirements.txt
@@ -0,0 +1,8 @@
+numpy
+opencv_python==4.5.4.60
+lxml
+torch==1.10.1
+torchvision==0.11.1
+pycocotools
+matplotlib
+tqdm
\ No newline at end of file
diff --git a/pytorch_keypoint/HRNet/train.py b/pytorch_keypoint/HRNet/train.py
new file mode 100644
index 000000000..7b7fa31f6
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train.py
@@ -0,0 +1,229 @@
+import json
+import os
+import datetime
+
+import torch
+from torch.utils import data
+import numpy as np
+
+import transforms
+from model import HighResolutionNet
+from my_dataset_coco import CocoKeypoint
+from train_utils import train_eval_utils as utils
+
+
+def create_model(num_joints, load_pretrain_weights=True):
+ model = HighResolutionNet(base_channel=32, num_joints=num_joints)
+
+ if load_pretrain_weights:
+        # load pretrained weights
+        # link: https://pan.baidu.com/s/1Lu6mMAWfm_8GGykttFMpVw  extraction code: f43o
+ weights_dict = torch.load("./hrnet_w32.pth", map_location='cpu')
+
+ for k in list(weights_dict.keys()):
+            # when loading ImageNet weights, drop the unused classification-head weights
+ if ("head" in k) or ("fc" in k):
+ del weights_dict[k]
+
+            # when loading COCO weights, drop the final layer if num_joints does not match
+ if "final_layer" in k:
+ if weights_dict[k].shape[0] != num_joints:
+ del weights_dict[k]
+
+ missing_keys, unexpected_keys = model.load_state_dict(weights_dict, strict=False)
+ if len(missing_keys) != 0:
+ print("missing_keys: ", missing_keys)
+
+ return model
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+ print("Using {} device training.".format(device.type))
+
+    # file used to record the COCO metrics for each epoch
+ results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
+
+ with open(args.keypoints_path, "r") as f:
+ person_kps_info = json.load(f)
+
+ fixed_size = args.fixed_size
+ heatmap_hw = (args.fixed_size[0] // 4, args.fixed_size[1] // 4)
+ kps_weights = np.array(person_kps_info["kps_weights"],
+ dtype=np.float32).reshape((args.num_joints,))
+ data_transform = {
+ "train": transforms.Compose([
+ transforms.HalfBody(0.3, person_kps_info["upper_body_ids"], person_kps_info["lower_body_ids"]),
+ transforms.AffineTransform(scale=(0.65, 1.35), rotation=(-45, 45), fixed_size=fixed_size),
+ transforms.RandomHorizontalFlip(0.5, person_kps_info["flip_pairs"]),
+ transforms.KeypointToHeatMap(heatmap_hw=heatmap_hw, gaussian_sigma=2, keypoints_weights=kps_weights),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ]),
+ "val": transforms.Compose([
+ transforms.AffineTransform(scale=(1.25, 1.25), fixed_size=fixed_size),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+ }
+
+ data_root = args.data_path
+
+ # load train data set
+ # coco2017 -> annotations -> person_keypoints_train2017.json
+ train_dataset = CocoKeypoint(data_root, "train", transforms=data_transform["train"], fixed_size=args.fixed_size)
+
+    # note: a custom collate_fn is needed because each sample contains both the image and its targets, so the default batching cannot be used
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using %g dataloader workers' % nw)
+
+ train_data_loader = data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ # load validation data set
+ # coco2017 -> annotations -> person_keypoints_val2017.json
+ val_dataset = CocoKeypoint(data_root, "val", transforms=data_transform["val"], fixed_size=args.fixed_size,
+ det_json_path=args.person_det)
+ val_data_loader = data.DataLoader(val_dataset,
+ batch_size=batch_size,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ # create model
+ model = create_model(num_joints=args.num_joints)
+ # print(model)
+
+ model.to(device)
+
+ # define optimizer
+ params = [p for p in model.parameters() if p.requires_grad]
+ optimizer = torch.optim.AdamW(params,
+ lr=args.lr,
+ weight_decay=args.weight_decay)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+ # learning rate scheduler
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
+
+    # if a checkpoint from a previous run is specified, resume training from it
+ if args.resume != "":
+ checkpoint = torch.load(args.resume, map_location='cpu')
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp and "scaler" in checkpoint:
+ scaler.load_state_dict(checkpoint["scaler"])
+ print("the training process from epoch{}...".format(args.start_epoch))
+
+ train_loss = []
+ learning_rate = []
+ val_map = []
+
+ for epoch in range(args.start_epoch, args.epochs):
+ # train for one epoch, printing every 50 iterations
+ mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
+ device=device, epoch=epoch,
+ print_freq=50, warmup=True,
+ scaler=scaler)
+ train_loss.append(mean_loss.item())
+ learning_rate.append(lr)
+
+ # update the learning rate
+ lr_scheduler.step()
+
+ # evaluate on the test dataset
+ coco_info = utils.evaluate(model, val_data_loader, device=device,
+ flip=True, flip_pairs=person_kps_info["flip_pairs"])
+
+ # write into txt
+ with open(results_file, "a") as f:
+            # the written values are the COCO metrics followed by the loss and the learning rate
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ val_map.append(coco_info[1]) # @0.5 mAP
+
+ # save weights
+ save_files = {
+ 'model': model.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'epoch': epoch}
+ if args.amp:
+ save_files["scaler"] = scaler.state_dict()
+ torch.save(save_files, "./save_weights/model-{}.pth".format(epoch))
+
+ # plot loss and lr curve
+ if len(train_loss) != 0 and len(learning_rate) != 0:
+ from plot_curve import plot_loss_and_lr
+ plot_loss_and_lr(train_loss, learning_rate)
+
+ # plot mAP curve
+ if len(val_map) != 0:
+ from plot_curve import plot_map
+ plot_map(val_map)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+    # training device
+    parser.add_argument('--device', default='cuda:0', help='device')
+    # root directory of the training dataset (coco2017)
+    parser.add_argument('--data-path', default='/data/coco2017', help='dataset')
+    # person keypoint meta information of the COCO dataset
+    parser.add_argument('--keypoints-path', default="./person_keypoints.json", type=str,
+                        help='person_keypoints.json path')
+    # person detection results for the val set provided by the original project;
+    # keep it as None (recommended) to use the GT boxes instead
+    parser.add_argument('--person-det', type=str, default=None)
+    parser.add_argument('--fixed-size', default=[256, 192], nargs='+', type=int, help='input size')
+    # number of keypoints
+    parser.add_argument('--num-joints', default=17, type=int, help='num_joints')
+    # directory where weights are saved
+    parser.add_argument('--output-dir', default='./save_weights', help='path where to save')
+    # checkpoint to resume training from
+    parser.add_argument('--resume', default='', type=str, help='resume from checkpoint')
+    # epoch to start training from
+    parser.add_argument('--start-epoch', default=0, type=int, help='start epoch')
+    # total number of training epochs
+    parser.add_argument('--epochs', default=210, type=int, metavar='N',
+                        help='number of total epochs to run')
+    # parameter of torch.optim.lr_scheduler.MultiStepLR
+    parser.add_argument('--lr-steps', default=[170, 200], nargs='+', type=int, help='decrease lr every step-size epochs')
+    # parameter of torch.optim.lr_scheduler.MultiStepLR
+    parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
+    # learning rate
+    parser.add_argument('--lr', default=0.001, type=float,
+                        help='initial learning rate')
+    # weight_decay of AdamW
+    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                        metavar='W', help='weight decay (default: 1e-4)',
+                        dest='weight_decay')
+    # training batch size
+    parser.add_argument('--batch-size', default=32, type=int, metavar='N',
+                        help='batch size when training.')
+    # whether to use mixed precision training (requires GPU support)
+    parser.add_argument("--amp", action="http://github.com/store_true", help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+ print(args)
+
+    # create the output directory for weights if it does not exist
+ if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+
+ main(args)
diff --git a/pytorch_keypoint/HRNet/train_multi_GPU.py b/pytorch_keypoint/HRNet/train_multi_GPU.py
new file mode 100644
index 000000000..9235db6e1
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train_multi_GPU.py
@@ -0,0 +1,272 @@
+import json
+import time
+import os
+import datetime
+
+import torch
+from torch.utils import data
+import numpy as np
+
+import transforms
+from model import HighResolutionNet
+from my_dataset_coco import CocoKeypoint
+import train_utils.train_eval_utils as utils
+from train_utils import init_distributed_mode, save_on_master, mkdir
+
+
+def create_model(num_joints, load_pretrain_weights=True):
+ model = HighResolutionNet(base_channel=32, num_joints=num_joints)
+
+ if load_pretrain_weights:
+        # load pretrained weights
+        # link: https://pan.baidu.com/s/1Lu6mMAWfm_8GGykttFMpVw  extraction code: f43o
+ weights_dict = torch.load("./hrnet_w32.pth", map_location='cpu')
+
+ for k in list(weights_dict.keys()):
+            # when loading ImageNet weights, drop the unused classification-head weights
+ if ("head" in k) or ("fc" in k):
+ del weights_dict[k]
+
+            # when loading COCO weights, drop the final layer if num_joints does not match
+ if "final_layer" in k:
+ if weights_dict[k].shape[0] != num_joints:
+ del weights_dict[k]
+
+ missing_keys, unexpected_keys = model.load_state_dict(weights_dict, strict=False)
+ if len(missing_keys) != 0:
+ print("missing_keys: ", missing_keys)
+
+ return model
+
+
+def main(args):
+ init_distributed_mode(args)
+ print(args)
+
+ device = torch.device(args.device)
+
+    # file used to record the COCO metrics for each epoch
+ now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+ key_results_file = f"results{now}.txt"
+
+ with open(args.keypoints_path, "r") as f:
+ person_kps_info = json.load(f)
+
+ fixed_size = args.fixed_size
+ heatmap_hw = (args.fixed_size[0] // 4, args.fixed_size[1] // 4)
+ kps_weights = np.array(person_kps_info["kps_weights"],
+ dtype=np.float32).reshape((args.num_joints,))
+ data_transform = {
+ "train": transforms.Compose([
+ transforms.HalfBody(0.3, person_kps_info["upper_body_ids"], person_kps_info["lower_body_ids"]),
+ transforms.AffineTransform(scale=(0.65, 1.35), rotation=(-45, 45), fixed_size=fixed_size),
+ transforms.RandomHorizontalFlip(0.5, person_kps_info["flip_pairs"]),
+ transforms.KeypointToHeatMap(heatmap_hw=heatmap_hw, gaussian_sigma=2, keypoints_weights=kps_weights),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ]),
+ "val": transforms.Compose([
+ transforms.AffineTransform(scale=(1.25, 1.25), fixed_size=fixed_size),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+ }
+
+ data_root = args.data_path
+
+ # load train data set
+ # coco2017 -> annotations -> person_keypoints_train2017.json
+ train_dataset = CocoKeypoint(data_root, "train", transforms=data_transform["train"], fixed_size=args.fixed_size)
+
+ # load validation data set
+ # coco2017 -> annotations -> person_keypoints_val2017.json
+ val_dataset = CocoKeypoint(data_root, "val", transforms=data_transform["val"], fixed_size=args.fixed_size,
+ det_json_path=args.person_det)
+
+ print("Creating data loaders")
+ if args.distributed:
+ train_sampler = data.distributed.DistributedSampler(train_dataset)
+ test_sampler = data.distributed.DistributedSampler(val_dataset)
+ else:
+ train_sampler = data.RandomSampler(train_dataset)
+ test_sampler = data.SequentialSampler(val_dataset)
+
+ train_batch_sampler = data.BatchSampler(train_sampler, args.batch_size, drop_last=True)
+
+ data_loader = data.DataLoader(train_dataset,
+ batch_sampler=train_batch_sampler,
+ num_workers=args.workers,
+ collate_fn=train_dataset.collate_fn)
+
+ data_loader_test = data.DataLoader(val_dataset,
+ batch_size=args.batch_size,
+ sampler=test_sampler,
+ num_workers=args.workers,
+ collate_fn=train_dataset.collate_fn)
+
+ print("Creating model")
+    # create the keypoint model
+ model = create_model(num_joints=args.num_joints)
+ model.to(device)
+
+ if args.distributed and args.sync_bn:
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
+ model_without_ddp = model
+ if args.distributed:
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
+ model_without_ddp = model.module
+
+ params = [p for p in model.parameters() if p.requires_grad]
+ optimizer = torch.optim.AdamW(params,
+ lr=args.lr,
+ weight_decay=args.weight_decay)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
+
+    # if a resume checkpoint (weights from a previous run) is given, continue training from it
+ if args.resume:
+ # If map_location is missing, torch.load will first load the module to CPU
+ # and then copy each parameter to where it was saved,
+ # which would result in all processes on the same machine using the same set of devices.
+        checkpoint = torch.load(args.resume, map_location='cpu')  # load the saved checkpoint (including optimizer and lr scheduler states)
+ model_without_ddp.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp and "scaler" in checkpoint:
+ scaler.load_state_dict(checkpoint["scaler"])
+
+ if args.test_only:
+ utils.evaluate(model, data_loader_test, device=device,
+ flip=True, flip_pairs=person_kps_info["flip_pairs"])
+ return
+
+ train_loss = []
+ learning_rate = []
+ val_map = []
+
+ print("Start training")
+ start_time = time.time()
+ for epoch in range(args.start_epoch, args.epochs):
+ if args.distributed:
+ train_sampler.set_epoch(epoch)
+ mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader,
+ device, epoch, args.print_freq,
+ warmup=True, scaler=scaler)
+
+ # update learning rate
+ lr_scheduler.step()
+
+ # evaluate after every epoch
+ key_info = utils.evaluate(model, data_loader_test, device=device,
+ flip=True, flip_pairs=person_kps_info["flip_pairs"])
+
+        # only the main process writes results
+ if args.rank in [-1, 0]:
+ train_loss.append(mean_loss.item())
+ learning_rate.append(lr)
+ val_map.append(key_info[1]) # @0.5 mAP
+
+ # write into txt
+ with open(key_results_file, "a") as f:
+                # the written values are the COCO metrics followed by the loss and the learning rate
+ result_info = [f"{i:.4f}" for i in key_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ if args.output_dir:
+            # weights are saved on the main process only
+ save_files = {'model': model_without_ddp.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'args': args,
+ 'epoch': epoch}
+ if args.amp:
+ save_files["scaler"] = scaler.state_dict()
+ save_on_master(save_files,
+ os.path.join(args.output_dir, f'model_{epoch}.pth'))
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('Training time {}'.format(total_time_str))
+
+ if args.rank in [-1, 0]:
+ # plot loss and lr curve
+ if len(train_loss) != 0 and len(learning_rate) != 0:
+ from plot_curve import plot_loss_and_lr
+ plot_loss_and_lr(train_loss, learning_rate)
+
+ # plot mAP curve
+ if len(val_map) != 0:
+ from plot_curve import plot_map
+ plot_map(val_map)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+    # root directory of the training dataset (coco2017)
+    parser.add_argument('--data-path', default='/data/coco2017', help='dataset')
+    # training device
+    parser.add_argument('--device', default='cuda', help='device')
+    # person keypoint meta information of the COCO dataset
+    parser.add_argument('--keypoints-path', default="./person_keypoints.json", type=str,
+                        help='person_keypoints.json path')
+    # person detection results for the val set provided by the original project;
+    # keep it as None (recommended) to use the GT boxes instead
+    parser.add_argument('--person-det', type=str, default=None)
+    parser.add_argument('--fixed-size', default=[256, 192], nargs='+', type=int, help='input size')
+    # number of keypoints
+    parser.add_argument('--num-joints', default=17, type=int, help='num_joints(num_keypoints)')
+    # batch size per GPU
+    parser.add_argument('-b', '--batch-size', default=32, type=int,
+                        help='images per gpu, the total batch size is $NGPU x batch_size')
+    # epoch to start training from
+    parser.add_argument('--start-epoch', default=0, type=int, help='start epoch')
+    # total number of training epochs
+    parser.add_argument('--epochs', default=210, type=int, metavar='N',
+                        help='number of total epochs to run')
+    # number of data loading / preprocessing workers
+    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                        help='number of data loading workers (default: 4)')
+    # learning rate
+    parser.add_argument('--lr', default=0.001, type=float,
+                        help='initial learning rate, 0.001 is the default value for training '
+                             'on 4 gpus and 32 images_per_gpu')
+    # weight_decay of AdamW
+    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                        metavar='W', help='weight decay (default: 1e-4)',
+                        dest='weight_decay')
+    # parameter of torch.optim.lr_scheduler.MultiStepLR
+    parser.add_argument('--lr-steps', default=[170, 200], nargs='+', type=int,
+                        help='decrease lr every step-size epochs')
+    # parameter of torch.optim.lr_scheduler.MultiStepLR
+    parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
+    # how often training information is printed
+    parser.add_argument('--print-freq', default=50, type=int, help='print frequency')
+    # directory where weights are saved
+    parser.add_argument('--output-dir', default='./multi_train', help='path where to save')
+    # checkpoint to resume training from
+    parser.add_argument('--resume', default='', help='resume from checkpoint')
+    parser.add_argument('--test-only', action="http://github.com/store_true", help="test only")
+
+    # number of processes to launch (processes, not threads)
+    parser.add_argument('--world-size', default=4, type=int,
+                        help='number of distributed processes')
+    parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
+    parser.add_argument("--sync-bn", action="http://github.com/store_true", help="Use sync batch norm")
+    # whether to use mixed precision training (requires GPU support)
+    parser.add_argument("--amp", action="http://github.com/store_true", help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+
+    # if an output directory is specified, create it when it does not exist
+ if args.output_dir:
+ mkdir(args.output_dir)
+
+ main(args)
diff --git a/pytorch_keypoint/HRNet/train_utils/__init__.py b/pytorch_keypoint/HRNet/train_utils/__init__.py
new file mode 100644
index 000000000..3dfa7eadc
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train_utils/__init__.py
@@ -0,0 +1,4 @@
+from .group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
+from .distributed_utils import init_distributed_mode, save_on_master, mkdir
+from .coco_eval import EvalCOCOMetric
+from .coco_utils import coco_remove_images_without_annotations, convert_coco_poly_mask, convert_to_coco_api
diff --git a/pytorch_keypoint/HRNet/train_utils/coco_eval.py b/pytorch_keypoint/HRNet/train_utils/coco_eval.py
new file mode 100644
index 000000000..99aff2c20
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train_utils/coco_eval.py
@@ -0,0 +1,132 @@
+import json
+import copy
+
+from PIL import Image, ImageDraw
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from .distributed_utils import all_gather, is_main_process
+from transforms import affine_points
+
+
+def merge(img_ids, eval_results):
+    """Gather the data from all processes and merge it together."""
+ all_img_ids = all_gather(img_ids)
+ all_eval_results = all_gather(eval_results)
+
+ merged_img_ids = []
+ for p in all_img_ids:
+ merged_img_ids.extend(p)
+
+ merged_eval_results = []
+ for p in all_eval_results:
+ merged_eval_results.extend(p)
+
+ merged_img_ids = np.array(merged_img_ids)
+
+ # keep only unique (and in sorted order) images
+    # remove duplicated image ids; to give every process the same number of images, an image may be assigned to more than one process during multi-GPU training
+ merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
+ merged_eval_results = [merged_eval_results[i] for i in idx]
+
+ return list(merged_img_ids), merged_eval_results
+
+
+class EvalCOCOMetric:
+ def __init__(self,
+ coco: COCO = None,
+ iou_type: str = "keypoints",
+ results_file_name: str = "predict_results.json",
+ classes_mapping: dict = None,
+ threshold: float = 0.2):
+ self.coco = copy.deepcopy(coco)
+        self.obj_ids = []  # ids of the targets (persons) already processed by this process
+ self.results = []
+ self.aggregation_results = None
+ self.classes_mapping = classes_mapping
+ self.coco_evaluator = None
+ assert iou_type in ["keypoints"]
+ self.iou_type = iou_type
+ self.results_file_name = results_file_name
+ self.threshold = threshold
+
+ def plot_img(self, img_path, keypoints, r=3):
+ img = Image.open(img_path)
+ draw = ImageDraw.Draw(img)
+ for i, point in enumerate(keypoints):
+ draw.ellipse([point[0] - r, point[1] - r, point[0] + r, point[1] + r],
+ fill=(255, 0, 0))
+ img.show()
+
+ def prepare_for_coco_keypoints(self, targets, outputs):
+        # iterate over the prediction for each person (note: per person, not per image; one image may contain several persons)
+ for target, keypoints, scores in zip(targets, outputs[0], outputs[1]):
+ if len(keypoints) == 0:
+ continue
+
+ obj_idx = int(target["obj_index"])
+ if obj_idx in self.obj_ids:
+                # skip duplicated entries
+ continue
+
+ self.obj_ids.append(obj_idx)
+ # self.plot_img(target["image_path"], keypoints)
+
+            mask = np.greater(scores, self.threshold)
+ if mask.sum() == 0:
+ k_score = 0
+ else:
+ k_score = np.mean(scores[mask])
+
+ keypoints = np.concatenate([keypoints, scores], axis=1)
+ keypoints = np.reshape(keypoints, -1)
+
+            # round the coordinates (two decimals here)
+            # to reduce the resulting JSON file size.
+ keypoints = [round(k, 2) for k in keypoints.tolist()]
+
+ res = {"image_id": target["image_id"],
+ "category_id": 1, # person
+ "keypoints": keypoints,
+ "score": target["score"] * k_score}
+
+ self.results.append(res)
+
+ def update(self, targets, outputs):
+ if self.iou_type == "keypoints":
+ self.prepare_for_coco_keypoints(targets, outputs)
+ else:
+ raise KeyError(f"not support iou_type: {self.iou_type}")
+
+ def synchronize_results(self):
+        # synchronize the data collected in all processes
+ eval_ids, eval_results = merge(self.obj_ids, self.results)
+ self.aggregation_results = {"obj_ids": eval_ids, "results": eval_results}
+
+        # only the main process needs to save the results
+ if is_main_process():
+ # results = []
+ # [results.extend(i) for i in eval_results]
+ # write predict results into json file
+ json_str = json.dumps(eval_results, indent=4)
+ with open(self.results_file_name, 'w') as json_file:
+ json_file.write(json_str)
+
+ def evaluate(self):
+        # evaluation only needs to run on the main process
+ if is_main_process():
+ # accumulate predictions from all images
+ coco_true = self.coco
+ coco_pre = coco_true.loadRes(self.results_file_name)
+
+ self.coco_evaluator = COCOeval(cocoGt=coco_true, cocoDt=coco_pre, iouType=self.iou_type)
+
+ self.coco_evaluator.evaluate()
+ self.coco_evaluator.accumulate()
+ print(f"IoU metric: {self.iou_type}")
+ self.coco_evaluator.summarize()
+
+ coco_info = self.coco_evaluator.stats.tolist() # numpy to list
+ return coco_info
+ else:
+ return None
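
For reference, each entry that `prepare_for_coco_keypoints` appends to `self.results` (and that later ends up in the results json) has the structure sketched below; the numbers are illustrative only, the layout follows the code above:

```python
# Illustrative only: one keypoint result entry as built in prepare_for_coco_keypoints.
example_result = {
    "image_id": 397133,
    "category_id": 1,                      # always "person"
    # 17 joints flattened as [x1, y1, s1, x2, y2, s2, ...] -> 51 numbers,
    # coordinates rounded to keep the JSON file small
    "keypoints": [382.31, 156.72, 0.91] * 17,
    # person-box detection score multiplied by the mean keypoint score
    "score": 0.87,
}
```
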
diff --git a/pytorch_keypoint/HRNet/train_utils/coco_utils.py b/pytorch_keypoint/HRNet/train_utils/coco_utils.py
new file mode 100644
index 000000000..7a3b3122e
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train_utils/coco_utils.py
@@ -0,0 +1,98 @@
+import torch
+import torch.utils.data
+from pycocotools import mask as coco_mask
+from pycocotools.coco import COCO
+
+
+def coco_remove_images_without_annotations(dataset, ids):
+ """
+    Remove images that contain no objects, or whose objects all have a very small area, from a COCO dataset.
+    refer to:
+    https://github.com/pytorch/vision/blob/master/references/detection/coco_utils.py
+    :param dataset:
+    :param ids:
+ :return:
+ """
+ def _has_only_empty_bbox(anno):
+ return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
+
+ def _has_valid_annotation(anno):
+ # if it's empty, there is no annotation
+ if len(anno) == 0:
+ return False
+ # if all boxes have close to zero area, there is no annotation
+ if _has_only_empty_bbox(anno):
+ return False
+
+ return True
+
+ valid_ids = []
+ for ds_idx, img_id in enumerate(ids):
+ ann_ids = dataset.getAnnIds(imgIds=img_id, iscrowd=None)
+ anno = dataset.loadAnns(ann_ids)
+
+ if _has_valid_annotation(anno):
+ valid_ids.append(img_id)
+
+ return valid_ids
+
+
+def convert_coco_poly_mask(segmentations, height, width):
+ masks = []
+ for polygons in segmentations:
+ rles = coco_mask.frPyObjects(polygons, height, width)
+ mask = coco_mask.decode(rles)
+ if len(mask.shape) < 3:
+ mask = mask[..., None]
+ mask = torch.as_tensor(mask, dtype=torch.uint8)
+ mask = mask.any(dim=2)
+ masks.append(mask)
+ if masks:
+ masks = torch.stack(masks, dim=0)
+ else:
+        # if masks is empty there are no objects, so return an all-zero mask
+ masks = torch.zeros((0, height, width), dtype=torch.uint8)
+ return masks
+
+
+def convert_to_coco_api(self):
+ coco_ds = COCO()
+ # annotation IDs need to start at 1, not 0, see torchvision issue #1530
+ ann_id = 1
+ dataset = {"images": [], "categories": [], "annotations": []}
+ categories = set()
+ for img_idx in range(len(self)):
+ targets, h, w = self.get_annotations(img_idx)
+ img_id = targets["image_id"].item()
+ img_dict = {"id": img_id,
+ "height": h,
+ "width": w}
+ dataset["images"].append(img_dict)
+ bboxes = targets["boxes"].clone()
+ # convert (x_min, ymin, xmax, ymax) to (xmin, ymin, w, h)
+ bboxes[:, 2:] -= bboxes[:, :2]
+ bboxes = bboxes.tolist()
+ labels = targets["labels"].tolist()
+ areas = targets["area"].tolist()
+ iscrowd = targets["iscrowd"].tolist()
+ if "masks" in targets:
+ masks = targets["masks"]
+ # make masks Fortran contiguous for coco_mask
+ masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
+ num_objs = len(bboxes)
+ for i in range(num_objs):
+ ann = {"image_id": img_id,
+ "bbox": bboxes[i],
+ "category_id": labels[i],
+ "area": areas[i],
+ "iscrowd": iscrowd[i],
+ "id": ann_id}
+ categories.add(labels[i])
+ if "masks" in targets:
+ ann["segmentation"] = coco_mask.encode(masks[i].numpy())
+ dataset["annotations"].append(ann)
+ ann_id += 1
+ dataset["categories"] = [{"id": i} for i in sorted(categories)]
+ coco_ds.dataset = dataset
+ coco_ds.createIndex()
+ return coco_ds
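
`convert_to_coco_api` only assumes the dataset exposes `__len__` and `get_annotations(idx) -> (targets, h, w)` with `image_id`, `boxes`, `labels`, `area` and `iscrowd` tensors. A tiny dummy dataset satisfying that contract (hypothetical, for illustration; the import path assumes the HRNet project root is the working directory):

```python
import torch
from train_utils.coco_utils import convert_to_coco_api  # assumed import path

class DummyDetDataset:
    """Minimal in-memory dataset matching the contract used by convert_to_coco_api."""
    def __len__(self):
        return 1

    def get_annotations(self, idx):
        targets = {
            "image_id": torch.tensor(idx + 1),
            "boxes": torch.tensor([[10., 20., 110., 220.]]),  # xmin, ymin, xmax, ymax
            "labels": torch.tensor([1]),
            "area": torch.tensor([100. * 200.]),
            "iscrowd": torch.tensor([0]),
        }
        return targets, 480, 640  # targets, height, width

coco_gt = convert_to_coco_api(DummyDetDataset())
print(coco_gt.getImgIds())  # [1]
```
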
diff --git a/pytorch_keypoint/HRNet/train_utils/distributed_utils.py b/pytorch_keypoint/HRNet/train_utils/distributed_utils.py
new file mode 100644
index 000000000..514b8fd92
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train_utils/distributed_utils.py
@@ -0,0 +1,298 @@
+from collections import defaultdict, deque
+import datetime
+import pickle
+import time
+import errno
+import os
+
+import torch
+import torch.distributed as dist
+
+
+class SmoothedValue(object):
+ """Track a series of values and provide access to smoothed values over a
+ window or the global series average.
+ """
+ def __init__(self, window_size=20, fmt=None):
+ if fmt is None:
+ fmt = "{value:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)  # bounded deque holding the most recent values
+ self.total = 0.0
+ self.count = 0
+ self.fmt = fmt
+
+ def update(self, value, n=1):
+ self.deque.append(value)
+ self.count += n
+ self.total += value * n
+
+ def synchronize_between_processes(self):
+ """
+ Warning: does not synchronize the deque!
+ """
+ if not is_dist_avail_and_initialized():
+ return
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+ dist.barrier()
+ dist.all_reduce(t)
+ t = t.tolist()
+ self.count = int(t[0])
+ self.total = t[1]
+
+ @property
+    def median(self):  # @property exposes median as a read-only attribute
+ d = torch.tensor(list(self.deque))
+ return d.median().item()
+
+ @property
+ def avg(self):
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
+ return d.mean().item()
+
+ @property
+ def global_avg(self):
+ return self.total / self.count
+
+ @property
+ def max(self):
+ return max(self.deque)
+
+ @property
+ def value(self):
+ return self.deque[-1]
+
+ def __str__(self):
+ return self.fmt.format(
+ median=self.median,
+ avg=self.avg,
+ global_avg=self.global_avg,
+ max=self.max,
+ value=self.value)
+
+
+def all_gather(data):
+ """
+    Gather data from every process.
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
+ Args:
+ data: any picklable object
+ Returns:
+ list[data]: list of data gathered from each rank
+ """
+    world_size = get_world_size()  # number of processes
+ if world_size == 1:
+ return [data]
+
+ data_list = [None] * world_size
+ dist.all_gather_object(data_list, data)
+
+ return data_list
+
+
+def reduce_dict(input_dict, average=True):
+ """
+ Args:
+ input_dict (dict): all the values will be reduced
+ average (bool): whether to do average or sum
+ Reduce the values in the dictionary from all processes so that all processes
+ have the averaged results. Returns a dict with the same fields as
+ input_dict, after reduction.
+ """
+ world_size = get_world_size()
+    if world_size < 2:  # single-GPU case
+ return input_dict
+    with torch.no_grad():  # multi-GPU case
+ names = []
+ values = []
+ # sort the keys so that they are consistent across processes
+ for k in sorted(input_dict.keys()):
+ names.append(k)
+ values.append(input_dict[k])
+ values = torch.stack(values, dim=0)
+ dist.all_reduce(values)
+ if average:
+ values /= world_size
+
+ reduced_dict = {k: v for k, v in zip(names, values)}
+ return reduced_dict
+
+
+class MetricLogger(object):
+ def __init__(self, delimiter="\t"):
+ self.meters = defaultdict(SmoothedValue)
+ self.delimiter = delimiter
+
+ def update(self, **kwargs):
+ for k, v in kwargs.items():
+ if isinstance(v, torch.Tensor):
+ v = v.item()
+ assert isinstance(v, (float, int))
+ self.meters[k].update(v)
+
+ def __getattr__(self, attr):
+ if attr in self.meters:
+ return self.meters[attr]
+ if attr in self.__dict__:
+ return self.__dict__[attr]
+ raise AttributeError("'{}' object has no attribute '{}'".format(
+ type(self).__name__, attr))
+
+ def __str__(self):
+ loss_str = []
+ for name, meter in self.meters.items():
+ loss_str.append(
+ "{}: {}".format(name, str(meter))
+ )
+ return self.delimiter.join(loss_str)
+
+ def synchronize_between_processes(self):
+ for meter in self.meters.values():
+ meter.synchronize_between_processes()
+
+ def add_meter(self, name, meter):
+ self.meters[name] = meter
+
+ def log_every(self, iterable, print_freq, header=None):
+ i = 0
+ if not header:
+ header = ""
+ start_time = time.time()
+ end = time.time()
+ iter_time = SmoothedValue(fmt='{avg:.4f}')
+ data_time = SmoothedValue(fmt='{avg:.4f}')
+ space_fmt = ":" + str(len(str(len(iterable)))) + "d"
+ if torch.cuda.is_available():
+ log_msg = self.delimiter.join([header,
+ '[{0' + space_fmt + '}/{1}]',
+ 'eta: {eta}',
+ '{meters}',
+ 'time: {time}',
+ 'data: {data}',
+ 'max mem: {memory:.0f}'])
+ else:
+ log_msg = self.delimiter.join([header,
+ '[{0' + space_fmt + '}/{1}]',
+ 'eta: {eta}',
+ '{meters}',
+ 'time: {time}',
+ 'data: {data}'])
+ MB = 1024.0 * 1024.0
+ for obj in iterable:
+ data_time.update(time.time() - end)
+ yield obj
+ iter_time.update(time.time() - end)
+ if i % print_freq == 0 or i == len(iterable) - 1:
+ eta_second = int(iter_time.global_avg * (len(iterable) - i))
+ eta_string = str(datetime.timedelta(seconds=eta_second))
+ if torch.cuda.is_available():
+ print(log_msg.format(i, len(iterable),
+ eta=eta_string,
+ meters=str(self),
+ time=str(iter_time),
+ data=str(data_time),
+ memory=torch.cuda.max_memory_allocated() / MB))
+ else:
+ print(log_msg.format(i, len(iterable),
+ eta=eta_string,
+ meters=str(self),
+ time=str(iter_time),
+ data=str(data_time)))
+ i += 1
+ end = time.time()
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('{} Total time: {} ({:.4f} s / it)'.format(header,
+ total_time_str,
+ total_time / len(iterable)))
+
+
+def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
+
+ def f(x):
+        """Return a learning-rate multiplier for the given step."""
+        if x >= warmup_iters:  # once the iteration count reaches warmup_iters the multiplier is 1
+            return 1
+        alpha = float(x) / warmup_iters
+        # during warmup the multiplier grows linearly from warmup_factor to 1
+ return warmup_factor * (1 - alpha) + alpha
+
+ return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
+
+
+def mkdir(path):
+ try:
+ os.makedirs(path)
+ except OSError as e:
+ if e.errno != errno.EEXIST:
+ raise
+
+
+def setup_for_distributed(is_master):
+ """
+    This function disables printing when not in the master process
+ """
+ import builtins as __builtin__
+ builtin_print = __builtin__.print
+
+ def print(*args, **kwargs):
+ force = kwargs.pop('force', False)
+ if is_master or force:
+ builtin_print(*args, **kwargs)
+
+ __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+    """Check whether distributed training is available and initialized."""
+ if not dist.is_available():
+ return False
+ if not dist.is_initialized():
+ return False
+ return True
+
+
+def get_world_size():
+ if not is_dist_avail_and_initialized():
+ return 1
+ return dist.get_world_size()
+
+
+def get_rank():
+ if not is_dist_avail_and_initialized():
+ return 0
+ return dist.get_rank()
+
+
+def is_main_process():
+ return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):
+ if is_main_process():
+ torch.save(*args, **kwargs)
+
+
+def init_distributed_mode(args):
+ if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+ args.rank = int(os.environ["RANK"])
+ args.world_size = int(os.environ['WORLD_SIZE'])
+ args.gpu = int(os.environ['LOCAL_RANK'])
+ elif 'SLURM_PROCID' in os.environ:
+ args.rank = int(os.environ['SLURM_PROCID'])
+ args.gpu = args.rank % torch.cuda.device_count()
+ else:
+ print('Not using distributed mode')
+ args.distributed = False
+ return
+
+ args.distributed = True
+
+ torch.cuda.set_device(args.gpu)
+ args.dist_backend = 'nccl'
+ print('| distributed init (rank {}): {}'.format(
+ args.rank, args.dist_url), flush=True)
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+ world_size=args.world_size, rank=args.rank)
+ torch.distributed.barrier()
+ setup_for_distributed(args.rank == 0)
+
diff --git a/pytorch_keypoint/HRNet/train_utils/group_by_aspect_ratio.py b/pytorch_keypoint/HRNet/train_utils/group_by_aspect_ratio.py
new file mode 100644
index 000000000..e7b8b9e88
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train_utils/group_by_aspect_ratio.py
@@ -0,0 +1,201 @@
+import bisect
+from collections import defaultdict
+import copy
+from itertools import repeat, chain
+import math
+import numpy as np
+
+import torch
+import torch.utils.data
+from torch.utils.data.sampler import BatchSampler, Sampler
+from torch.utils.model_zoo import tqdm
+import torchvision
+
+from PIL import Image
+
+
+def _repeat_to_at_least(iterable, n):
+ repeat_times = math.ceil(n / len(iterable))
+ repeated = chain.from_iterable(repeat(iterable, repeat_times))
+ return list(repeated)
+
+
+class GroupedBatchSampler(BatchSampler):
+ """
+ Wraps another sampler to yield a mini-batch of indices.
+ It enforces that the batch only contain elements from the same group.
+ It also tries to provide mini-batches which follows an ordering which is
+ as close as possible to the ordering from the original sampler.
+ Arguments:
+ sampler (Sampler): Base sampler.
+ group_ids (list[int]): If the sampler produces indices in range [0, N),
+ `group_ids` must be a list of `N` ints which contains the group id of each sample.
+ The group ids must be a continuous set of integers starting from
+ 0, i.e. they must be in the range [0, num_groups).
+ batch_size (int): Size of mini-batch.
+ """
+ def __init__(self, sampler, group_ids, batch_size):
+ if not isinstance(sampler, Sampler):
+ raise ValueError(
+ "sampler should be an instance of "
+ "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+ )
+ self.sampler = sampler
+ self.group_ids = group_ids
+ self.batch_size = batch_size
+
+ def __iter__(self):
+ buffer_per_group = defaultdict(list)
+ samples_per_group = defaultdict(list)
+
+ num_batches = 0
+ for idx in self.sampler:
+ group_id = self.group_ids[idx]
+ buffer_per_group[group_id].append(idx)
+ samples_per_group[group_id].append(idx)
+ if len(buffer_per_group[group_id]) == self.batch_size:
+ yield buffer_per_group[group_id]
+ num_batches += 1
+ del buffer_per_group[group_id]
+ assert len(buffer_per_group[group_id]) < self.batch_size
+
+ # now we have run out of elements that satisfy
+ # the group criteria, let's return the remaining
+ # elements so that the size of the sampler is
+ # deterministic
+ expected_num_batches = len(self)
+ num_remaining = expected_num_batches - num_batches
+ if num_remaining > 0:
+ # for the remaining batches, take first the buffers with largest number
+ # of elements
+ for group_id, _ in sorted(buffer_per_group.items(),
+ key=lambda x: len(x[1]), reverse=True):
+ remaining = self.batch_size - len(buffer_per_group[group_id])
+ samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining)
+ buffer_per_group[group_id].extend(samples_from_group_id[:remaining])
+ assert len(buffer_per_group[group_id]) == self.batch_size
+ yield buffer_per_group[group_id]
+ num_remaining -= 1
+ if num_remaining == 0:
+ break
+ assert num_remaining == 0
+
+ def __len__(self):
+ return len(self.sampler) // self.batch_size
+
+
+def _compute_aspect_ratios_slow(dataset, indices=None):
+ print("Your dataset doesn't support the fast path for "
+ "computing the aspect ratios, so will iterate over "
+ "the full dataset and load every image instead. "
+ "This might take some time...")
+ if indices is None:
+ indices = range(len(dataset))
+
+ class SubsetSampler(Sampler):
+ def __init__(self, indices):
+ self.indices = indices
+
+ def __iter__(self):
+ return iter(self.indices)
+
+ def __len__(self):
+ return len(self.indices)
+
+ sampler = SubsetSampler(indices)
+ data_loader = torch.utils.data.DataLoader(
+ dataset, batch_size=1, sampler=sampler,
+ num_workers=14, # you might want to increase it for faster processing
+ collate_fn=lambda x: x[0])
+ aspect_ratios = []
+ with tqdm(total=len(dataset)) as pbar:
+ for _i, (img, _) in enumerate(data_loader):
+ pbar.update(1)
+ height, width = img.shape[-2:]
+ aspect_ratio = float(width) / float(height)
+ aspect_ratios.append(aspect_ratio)
+ return aspect_ratios
+
+
+def _compute_aspect_ratios_custom_dataset(dataset, indices=None):
+ if indices is None:
+ indices = range(len(dataset))
+ aspect_ratios = []
+ for i in indices:
+ height, width = dataset.get_height_and_width(i)
+ aspect_ratio = float(width) / float(height)
+ aspect_ratios.append(aspect_ratio)
+ return aspect_ratios
+
+
+def _compute_aspect_ratios_coco_dataset(dataset, indices=None):
+ if indices is None:
+ indices = range(len(dataset))
+ aspect_ratios = []
+ for i in indices:
+ img_info = dataset.coco.imgs[dataset.ids[i]]
+ aspect_ratio = float(img_info["width"]) / float(img_info["height"])
+ aspect_ratios.append(aspect_ratio)
+ return aspect_ratios
+
+
+def _compute_aspect_ratios_voc_dataset(dataset, indices=None):
+ if indices is None:
+ indices = range(len(dataset))
+ aspect_ratios = []
+ for i in indices:
+ # this doesn't load the data into memory, because PIL loads it lazily
+ width, height = Image.open(dataset.images[i]).size
+ aspect_ratio = float(width) / float(height)
+ aspect_ratios.append(aspect_ratio)
+ return aspect_ratios
+
+
+def _compute_aspect_ratios_subset_dataset(dataset, indices=None):
+ if indices is None:
+ indices = range(len(dataset))
+
+ ds_indices = [dataset.indices[i] for i in indices]
+ return compute_aspect_ratios(dataset.dataset, ds_indices)
+
+
+def compute_aspect_ratios(dataset, indices=None):
+ if hasattr(dataset, "get_height_and_width"):
+ return _compute_aspect_ratios_custom_dataset(dataset, indices)
+
+ if isinstance(dataset, torchvision.datasets.CocoDetection):
+ return _compute_aspect_ratios_coco_dataset(dataset, indices)
+
+ if isinstance(dataset, torchvision.datasets.VOCDetection):
+ return _compute_aspect_ratios_voc_dataset(dataset, indices)
+
+ if isinstance(dataset, torch.utils.data.Subset):
+ return _compute_aspect_ratios_subset_dataset(dataset, indices)
+
+ # slow path
+ return _compute_aspect_ratios_slow(dataset, indices)
+
+
+def _quantize(x, bins):
+ bins = copy.deepcopy(bins)
+ bins = sorted(bins)
+    # bisect_right: return the index at which y would be inserted into the sorted bins (to the right of equal values)
+ quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
+ return quantized
+
+
+def create_aspect_ratio_groups(dataset, k=0):
+    # compute the width/height aspect ratio of every image in the dataset
+    aspect_ratios = compute_aspect_ratios(dataset)
+    # generate 2*k+1 bin edges, log-spaced over the interval [0.5, 2]
+    bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0]
+
+    # map every aspect ratio to the index of the bin it falls into
+    groups = _quantize(aspect_ratios, bins)
+    # count number of elements per group
+    # (i.e. how many images fall into each aspect-ratio bin)
+ counts = np.unique(groups, return_counts=True)[1]
+ fbins = [0] + bins + [np.inf]
+ print("Using {} as bins for aspect ratio quantization".format(fbins))
+ print("Count of instances per bin: {}".format(counts))
+ return groups
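
A sketch of wiring the two exported helpers into a `DataLoader`. `DummyDataset` below is hypothetical and only exists to provide `get_height_and_width`; a real COCO/VOC dataset works the same way:

```python
import random
import torch
from train_utils import GroupedBatchSampler, create_aspect_ratio_groups

class DummyDataset(torch.utils.data.Dataset):
    """Toy dataset whose only job is to expose get_height_and_width for grouping."""
    def __init__(self, n=64):
        self.hw = [(random.choice([480, 640]), random.choice([480, 640])) for _ in range(n)]
    def __len__(self):
        return len(self.hw)
    def __getitem__(self, idx):
        return torch.tensor(self.hw[idx]), idx
    def get_height_and_width(self, idx):
        return self.hw[idx]

dataset = DummyDataset()
sampler = torch.utils.data.RandomSampler(dataset)
group_ids = create_aspect_ratio_groups(dataset, k=1)          # bins over [0.5, 2]
batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size=8)
loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler)
for hw, idxs in loader:
    pass  # every batch is drawn from a single aspect-ratio group
```
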
diff --git a/pytorch_keypoint/HRNet/train_utils/loss.py b/pytorch_keypoint/HRNet/train_utils/loss.py
new file mode 100644
index 000000000..1628dbf9a
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train_utils/loss.py
@@ -0,0 +1,20 @@
+import torch
+
+
+class KpLoss(object):
+ def __init__(self):
+ self.criterion = torch.nn.MSELoss(reduction='none')
+
+ def __call__(self, logits, targets):
+ assert len(logits.shape) == 4, 'logits should be 4-ndim'
+ device = logits.device
+ bs = logits.shape[0]
+ # [num_kps, H, W] -> [B, num_kps, H, W]
+ heatmaps = torch.stack([t["heatmap"].to(device) for t in targets])
+ # [num_kps] -> [B, num_kps]
+ kps_weights = torch.stack([t["kps_weights"].to(device) for t in targets])
+
+ # [B, num_kps, H, W] -> [B, num_kps]
+ loss = self.criterion(logits, heatmaps).mean(dim=[2, 3])
+ loss = torch.sum(loss * kps_weights) / bs
+ return loss
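
`KpLoss` expects raw heatmap logits plus, per sample, a target heatmap and per-keypoint weights. A self-contained sketch with random tensors, using the 64×48 heatmap size that goes with 256×192 inputs (import path assumes the HRNet project root):

```python
import torch
from train_utils.loss import KpLoss  # assumed import path

batch_size, num_kps, h, w = 2, 17, 64, 48
logits = torch.rand(batch_size, num_kps, h, w)
targets = [
    {"heatmap": torch.rand(num_kps, h, w),
     "kps_weights": torch.ones(num_kps)}   # a weight of 0 silences an invisible keypoint
    for _ in range(batch_size)
]

criterion = KpLoss()
# MSE per keypoint heatmap, averaged over HxW, weighted, then summed and divided by the batch size
loss = criterion(logits, targets)
print(loss.item())
```
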
diff --git a/pytorch_keypoint/HRNet/train_utils/train_eval_utils.py b/pytorch_keypoint/HRNet/train_utils/train_eval_utils.py
new file mode 100644
index 000000000..5f678b8de
--- /dev/null
+++ b/pytorch_keypoint/HRNet/train_utils/train_eval_utils.py
@@ -0,0 +1,119 @@
+import math
+import sys
+import time
+
+import torch
+
+import transforms
+import train_utils.distributed_utils as utils
+from .coco_eval import EvalCOCOMetric
+from .loss import KpLoss
+
+
+def train_one_epoch(model, optimizer, data_loader, device, epoch,
+ print_freq=50, warmup=False, scaler=None):
+ model.train()
+ metric_logger = utils.MetricLogger(delimiter=" ")
+ metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+ header = 'Epoch: [{}]'.format(epoch)
+
+ lr_scheduler = None
+    if epoch == 0 and warmup is True:  # enable warmup for the first epoch (epoch 0)
+ warmup_factor = 1.0 / 1000
+ warmup_iters = min(1000, len(data_loader) - 1)
+
+ lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
+
+ mse = KpLoss()
+ mloss = torch.zeros(1).to(device) # mean losses
+ for i, [images, targets] in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+ images = torch.stack([image.to(device) for image in images])
+
+        # autocast context manager for mixed-precision training; it is a no-op on CPU
+ with torch.cuda.amp.autocast(enabled=scaler is not None):
+ results = model(images)
+
+ losses = mse(results, targets)
+
+ # reduce losses over all GPUs for logging purpose
+ loss_dict_reduced = utils.reduce_dict({"losses": losses})
+ losses_reduced = sum(loss for loss in loss_dict_reduced.values())
+
+ loss_value = losses_reduced.item()
+        # record the running mean of the training loss
+ mloss = (mloss * i + loss_value) / (i + 1) # update mean losses
+
+        if not math.isfinite(loss_value):  # stop training when the loss is no longer finite
+ print("Loss is {}, stopping training".format(loss_value))
+ print(loss_dict_reduced)
+ sys.exit(1)
+
+ optimizer.zero_grad()
+ if scaler is not None:
+ scaler.scale(losses).backward()
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ losses.backward()
+ optimizer.step()
+
+        if lr_scheduler is not None:  # step the warmup scheduler during the first epoch
+ lr_scheduler.step()
+
+ metric_logger.update(loss=losses_reduced)
+ now_lr = optimizer.param_groups[0]["lr"]
+ metric_logger.update(lr=now_lr)
+
+ return mloss, now_lr
+
+
+@torch.no_grad()
+def evaluate(model, data_loader, device, flip=False, flip_pairs=None):
+ if flip:
+ assert flip_pairs is not None, "enable flip must provide flip_pairs."
+
+ model.eval()
+ metric_logger = utils.MetricLogger(delimiter=" ")
+ header = "Test: "
+
+ key_metric = EvalCOCOMetric(data_loader.dataset.coco, "keypoints", "key_results.json")
+ for image, targets in metric_logger.log_every(data_loader, 100, header):
+ images = torch.stack([img.to(device) for img in image])
+
+        # skip GPU synchronization when running on CPU
+ if device != torch.device("cpu"):
+ torch.cuda.synchronize(device)
+
+ model_time = time.time()
+ outputs = model(images)
+ if flip:
+ flipped_images = transforms.flip_images(images)
+ flipped_outputs = model(flipped_images)
+ flipped_outputs = transforms.flip_back(flipped_outputs, flip_pairs)
+ # feature is not aligned, shift flipped heatmap for higher accuracy
+ # https://github.com/leoxiaobin/deep-high-resolution-net.pytorch/issues/22
+ flipped_outputs[..., 1:] = flipped_outputs.clone()[..., 0:-1]
+ outputs = (outputs + flipped_outputs) * 0.5
+
+ model_time = time.time() - model_time
+
+ # decode keypoint
+ reverse_trans = [t["reverse_trans"] for t in targets]
+ outputs = transforms.get_final_preds(outputs, reverse_trans, post_processing=True)
+
+ key_metric.update(targets, outputs)
+ metric_logger.update(model_time=model_time)
+
+ # gather the stats from all processes
+ metric_logger.synchronize_between_processes()
+ print("Averaged stats:", metric_logger)
+
+    # synchronize the results collected in all processes
+ key_metric.synchronize_results()
+
+ if utils.is_main_process():
+ coco_info = key_metric.evaluate()
+ else:
+ coco_info = None
+
+ return coco_info
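
The flip-test branch above averages the normal heatmaps with heatmaps from a horizontally flipped input; after `flip_back`, a one-column shift compensates for the feature misalignment discussed in the linked issue. A standalone sketch of just that tensor manipulation; the `flip_pairs` indices are made up (not the COCO ones) and the import assumes the HRNet directory is on the path:

```python
import torch
from transforms import flip_back  # transforms.py from this project

heatmaps = torch.rand(1, 4, 64, 48)           # [batch, joints, H, W], pretend model output
flipped_heatmaps = torch.rand(1, 4, 64, 48)   # pretend output of model(flip_images(images))
flip_pairs = [[0, 1], [2, 3]]                 # illustrative left/right joint index pairs

flipped_heatmaps = flip_back(flipped_heatmaps, flip_pairs)
# shift one column to the right to realign the flipped prediction
flipped_heatmaps[..., 1:] = flipped_heatmaps.clone()[..., 0:-1]
averaged = (heatmaps + flipped_heatmaps) * 0.5
```
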
diff --git a/pytorch_keypoint/HRNet/transforms.py b/pytorch_keypoint/HRNet/transforms.py
new file mode 100644
index 000000000..b914e2fe3
--- /dev/null
+++ b/pytorch_keypoint/HRNet/transforms.py
@@ -0,0 +1,443 @@
+import math
+import random
+from typing import Tuple
+
+import cv2
+import numpy as np
+import torch
+from torchvision.transforms import functional as F
+import matplotlib.pyplot as plt
+
+
+def flip_images(img):
+ assert len(img.shape) == 4, 'images has to be [batch_size, channels, height, width]'
+ img = torch.flip(img, dims=[3])
+ return img
+
+
+def flip_back(output_flipped, matched_parts):
+ assert len(output_flipped.shape) == 4, 'output_flipped has to be [batch_size, num_joints, height, width]'
+ output_flipped = torch.flip(output_flipped, dims=[3])
+
+ for pair in matched_parts:
+ tmp = output_flipped[:, pair[0]].clone()
+ output_flipped[:, pair[0]] = output_flipped[:, pair[1]]
+ output_flipped[:, pair[1]] = tmp
+
+ return output_flipped
+
+
+def get_max_preds(batch_heatmaps):
+ """
+ get predictions from score maps
+ heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
+ """
+ assert isinstance(batch_heatmaps, torch.Tensor), 'batch_heatmaps should be torch.Tensor'
+ assert len(batch_heatmaps.shape) == 4, 'batch_images should be 4-ndim'
+
+ batch_size, num_joints, h, w = batch_heatmaps.shape
+ heatmaps_reshaped = batch_heatmaps.reshape(batch_size, num_joints, -1)
+ maxvals, idx = torch.max(heatmaps_reshaped, dim=2)
+
+ maxvals = maxvals.unsqueeze(dim=-1)
+ idx = idx.float()
+
+ preds = torch.zeros((batch_size, num_joints, 2)).to(batch_heatmaps)
+
+    preds[:, :, 0] = idx % w  # column index, i.e. the x coordinate of the maximum
+    preds[:, :, 1] = torch.floor(idx / w)  # row index, i.e. the y coordinate of the maximum
+
+ pred_mask = torch.gt(maxvals, 0.0).repeat(1, 1, 2).float().to(batch_heatmaps.device)
+
+ preds *= pred_mask
+ return preds, maxvals
+
+
+def affine_points(pt, t):
+ ones = np.ones((pt.shape[0], 1), dtype=float)
+ pt = np.concatenate([pt, ones], axis=1).T
+ new_pt = np.dot(t, pt)
+ return new_pt.T
+
+
+def get_final_preds(batch_heatmaps: torch.Tensor,
+ trans: list = None,
+ post_processing: bool = False):
+ assert trans is not None
+ coords, maxvals = get_max_preds(batch_heatmaps)
+
+ heatmap_height = batch_heatmaps.shape[2]
+ heatmap_width = batch_heatmaps.shape[3]
+
+ # post-processing
+ if post_processing:
+ for n in range(coords.shape[0]):
+ for p in range(coords.shape[1]):
+ hm = batch_heatmaps[n][p]
+ px = int(math.floor(coords[n][p][0] + 0.5))
+ py = int(math.floor(coords[n][p][1] + 0.5))
+ if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1:
+ diff = torch.tensor(
+ [
+ hm[py][px + 1] - hm[py][px - 1],
+ hm[py + 1][px] - hm[py - 1][px]
+ ]
+ ).to(batch_heatmaps.device)
+ coords[n][p] += torch.sign(diff) * .25
+
+ preds = coords.clone().cpu().numpy()
+
+ # Transform back
+ for i in range(coords.shape[0]):
+ preds[i] = affine_points(preds[i], trans[i])
+
+ return preds, maxvals.cpu().numpy()
+
+
+def decode_keypoints(outputs, origin_hw, num_joints: int = 17):
+ keypoints = []
+ scores = []
+ heatmap_h, heatmap_w = outputs.shape[-2:]
+ for i in range(num_joints):
+ pt = np.unravel_index(np.argmax(outputs[i]), (heatmap_h, heatmap_w))
+ score = outputs[i, pt[0], pt[1]]
+ keypoints.append(pt[::-1]) # hw -> wh(xy)
+ scores.append(score)
+
+ keypoints = np.array(keypoints, dtype=float)
+ scores = np.array(scores, dtype=float)
+ # convert to full image scale
+ keypoints[:, 0] = np.clip(keypoints[:, 0] / heatmap_w * origin_hw[1],
+ a_min=0,
+ a_max=origin_hw[1])
+ keypoints[:, 1] = np.clip(keypoints[:, 1] / heatmap_h * origin_hw[0],
+ a_min=0,
+ a_max=origin_hw[0])
+ return keypoints, scores
+
+
+def resize_pad(img: np.ndarray, size: tuple):
+ h, w, c = img.shape
+    src = np.array([[0, 0],       # top-left corner of the image in the source coordinate system
+                    [w - 1, 0],   # top-right corner of the image in the source coordinate system
+                    [0, h - 1]],  # bottom-left corner of the image in the source coordinate system
+                   dtype=np.float32)
+ dst = np.zeros((3, 2), dtype=np.float32)
+    if h / w > size[0] / size[1]:
+        # the image needs padding along the width direction
+        wi = size[0] * (w / h)
+        pad_w = (size[1] - wi) / 2
+        dst[0, :] = [pad_w - 1, 0]            # top-left corner in the target coordinate system
+        dst[1, :] = [size[1] - pad_w - 1, 0]  # top-right corner in the target coordinate system
+        dst[2, :] = [pad_w - 1, size[0] - 1]  # bottom-left corner in the target coordinate system
+    else:
+        # the image needs padding along the height direction
+        hi = size[1] * (h / w)
+        pad_h = (size[0] - hi) / 2
+        dst[0, :] = [0, pad_h - 1]            # top-left corner in the target coordinate system
+        dst[1, :] = [size[1] - 1, pad_h - 1]  # top-right corner in the target coordinate system
+        dst[2, :] = [0, size[0] - pad_h - 1]  # bottom-left corner in the target coordinate system
+
+    trans = cv2.getAffineTransform(src, dst)  # compute the forward affine transform matrix
+    # apply the affine transform to the image
+ resize_img = cv2.warpAffine(img,
+ trans,
+ size[::-1], # w, h
+ flags=cv2.INTER_LINEAR)
+ # import matplotlib.pyplot as plt
+ # plt.imshow(resize_img)
+ # plt.show()
+
+    dst /= 4  # the predicted heatmap is 1/4 the size of the network input
+    reverse_trans = cv2.getAffineTransform(dst, src)  # compute the inverse affine transform to map predictions back later
+
+ return resize_img, reverse_trans
+
+
+def adjust_box(xmin: float, ymin: float, w: float, h: float, fixed_size: Tuple[float, float]):
+    """Keep the aspect ratio of the input crop fixed by enlarging w or h."""
+ xmax = xmin + w
+ ymax = ymin + h
+
+ hw_ratio = fixed_size[0] / fixed_size[1]
+ if h / w > hw_ratio:
+        # pad along the width direction
+ wi = h / hw_ratio
+ pad_w = (wi - w) / 2
+ xmin = xmin - pad_w
+ xmax = xmax + pad_w
+ else:
+        # pad along the height direction
+ hi = w * hw_ratio
+ pad_h = (hi - h) / 2
+ ymin = ymin - pad_h
+ ymax = ymax + pad_h
+
+ return xmin, ymin, xmax, ymax
+
+
+def scale_box(xmin: float, ymin: float, w: float, h: float, scale_ratio: Tuple[float, float]):
+    """Recompute xmin, ymin, w, h from the given (h, w) scale factors scale_ratio."""
+ s_h = h * scale_ratio[0]
+ s_w = w * scale_ratio[1]
+ xmin = xmin - (s_w - w) / 2.
+ ymin = ymin - (s_h - h) / 2.
+ return xmin, ymin, s_w, s_h
+
+
+def plot_heatmap(image, heatmap, kps, kps_weights):
+ for kp_id in range(len(kps_weights)):
+ if kps_weights[kp_id] > 0:
+ plt.subplot(1, 2, 1)
+ plt.imshow(image)
+ plt.plot(*kps[kp_id].tolist(), "ro")
+ plt.title("image")
+ plt.subplot(1, 2, 2)
+ plt.imshow(heatmap[kp_id], cmap=plt.cm.Blues)
+ plt.colorbar(ticks=[0, 1])
+ plt.title(f"kp_id: {kp_id}")
+ plt.show()
+
+
+class Compose(object):
+    """Compose several transforms."""
+ def __init__(self, transforms):
+ self.transforms = transforms
+
+ def __call__(self, image, target):
+ for t in self.transforms:
+ image, target = t(image, target)
+ return image, target
+
+
+class ToTensor(object):
+    """Convert a PIL image to a Tensor."""
+ def __call__(self, image, target):
+ image = F.to_tensor(image)
+ return image, target
+
+
+class Normalize(object):
+ def __init__(self, mean=None, std=None):
+ self.mean = mean
+ self.std = std
+
+ def __call__(self, image, target):
+ image = F.normalize(image, mean=self.mean, std=self.std)
+ return image, target
+
+
+class HalfBody(object):
+ def __init__(self, p: float = 0.3, upper_body_ids=None, lower_body_ids=None):
+ assert upper_body_ids is not None
+ assert lower_body_ids is not None
+ self.p = p
+ self.upper_body_ids = upper_body_ids
+ self.lower_body_ids = lower_body_ids
+
+ def __call__(self, image, target):
+ if random.random() < self.p:
+ kps = target["keypoints"]
+ vis = target["visible"]
+ upper_kps = []
+ lower_kps = []
+
+            # split the visible keypoints into upper- and lower-body groups
+ for i, v in enumerate(vis):
+ if v > 0.5:
+ if i in self.upper_body_ids:
+ upper_kps.append(kps[i])
+ else:
+ lower_kps.append(kps[i])
+
+            # pick the upper or the lower body with 50% probability
+ if random.random() < 0.5:
+ selected_kps = upper_kps
+ else:
+ selected_kps = lower_kps
+
+            # if there are too few keypoints, leave the target unchanged
+ if len(selected_kps) > 2:
+ selected_kps = np.array(selected_kps, dtype=np.float32)
+ xmin, ymin = np.min(selected_kps, axis=0).tolist()
+ xmax, ymax = np.max(selected_kps, axis=0).tolist()
+ w = xmax - xmin
+ h = ymax - ymin
+ if w > 1 and h > 1:
+                    # enlarge w and h a bit so that the keypoints do not sit right at the crop border
+ xmin, ymin, w, h = scale_box(xmin, ymin, w, h, (1.5, 1.5))
+ target["box"] = [xmin, ymin, w, h]
+
+ return image, target
+
+
+class AffineTransform(object):
+ """scale+rotation"""
+ def __init__(self,
+ scale: Tuple[float, float] = None, # e.g. (0.65, 1.35)
+ rotation: Tuple[int, int] = None, # e.g. (-45, 45)
+ fixed_size: Tuple[int, int] = (256, 192)):
+ self.scale = scale
+ self.rotation = rotation
+ self.fixed_size = fixed_size
+
+ def __call__(self, img, target):
+ src_xmin, src_ymin, src_xmax, src_ymax = adjust_box(*target["box"], fixed_size=self.fixed_size)
+ src_w = src_xmax - src_xmin
+ src_h = src_ymax - src_ymin
+ src_center = np.array([(src_xmin + src_xmax) / 2, (src_ymin + src_ymax) / 2])
+ src_p2 = src_center + np.array([0, -src_h / 2]) # top middle
+ src_p3 = src_center + np.array([src_w / 2, 0]) # right middle
+
+ dst_center = np.array([(self.fixed_size[1] - 1) / 2, (self.fixed_size[0] - 1) / 2])
+ dst_p2 = np.array([(self.fixed_size[1] - 1) / 2, 0]) # top middle
+ dst_p3 = np.array([self.fixed_size[1] - 1, (self.fixed_size[0] - 1) / 2]) # right middle
+
+ if self.scale is not None:
+ scale = random.uniform(*self.scale)
+ src_w = src_w * scale
+ src_h = src_h * scale
+ src_p2 = src_center + np.array([0, -src_h / 2]) # top middle
+ src_p3 = src_center + np.array([src_w / 2, 0]) # right middle
+
+ if self.rotation is not None:
+            angle = random.randint(*self.rotation)  # angle in degrees
+            angle = angle / 180 * math.pi  # convert to radians
+ src_p2 = src_center + np.array([src_h / 2 * math.sin(angle), -src_h / 2 * math.cos(angle)])
+ src_p3 = src_center + np.array([src_w / 2 * math.cos(angle), src_w / 2 * math.sin(angle)])
+
+ src = np.stack([src_center, src_p2, src_p3]).astype(np.float32)
+ dst = np.stack([dst_center, dst_p2, dst_p3]).astype(np.float32)
+
+        trans = cv2.getAffineTransform(src, dst)  # compute the forward affine transform matrix
+        dst /= 4  # the predicted heatmap is 1/4 the size of the network input
+        reverse_trans = cv2.getAffineTransform(dst, src)  # compute the inverse affine transform to map predictions back later
+
+        # apply the affine transform to the image
+ resize_img = cv2.warpAffine(img,
+ trans,
+ tuple(self.fixed_size[::-1]), # [w, h]
+ flags=cv2.INTER_LINEAR)
+
+ if "keypoints" in target:
+ kps = target["keypoints"]
+ mask = np.logical_and(kps[:, 0] != 0, kps[:, 1] != 0)
+ kps[mask] = affine_points(kps[mask], trans)
+ target["keypoints"] = kps
+
+ # import matplotlib.pyplot as plt
+ # from draw_utils import draw_keypoints
+ # resize_img = draw_keypoints(resize_img, target["keypoints"])
+ # plt.imshow(resize_img)
+ # plt.show()
+
+ target["trans"] = trans
+ target["reverse_trans"] = reverse_trans
+ return resize_img, target
+
+
+class RandomHorizontalFlip(object):
+    """Randomly flip the input image horizontally; this transform must be applied after AffineTransform."""
+ def __init__(self, p: float = 0.5, matched_parts: list = None):
+ assert matched_parts is not None
+ self.p = p
+ self.matched_parts = matched_parts
+
+ def __call__(self, image, target):
+ if random.random() < self.p:
+ # [h, w, c]
+ image = np.ascontiguousarray(np.flip(image, axis=[1]))
+ keypoints = target["keypoints"]
+ visible = target["visible"]
+ width = image.shape[1]
+
+ # Flip horizontal
+ keypoints[:, 0] = width - keypoints[:, 0] - 1
+
+ # Change left-right parts
+ for pair in self.matched_parts:
+ keypoints[pair[0], :], keypoints[pair[1], :] = \
+ keypoints[pair[1], :], keypoints[pair[0], :].copy()
+
+ visible[pair[0]], visible[pair[1]] = \
+ visible[pair[1]], visible[pair[0]].copy()
+
+ target["keypoints"] = keypoints
+ target["visible"] = visible
+
+ return image, target
+
+
+class KeypointToHeatMap(object):
+ def __init__(self,
+ heatmap_hw: Tuple[int, int] = (256 // 4, 192 // 4),
+ gaussian_sigma: int = 2,
+ keypoints_weights=None):
+ self.heatmap_hw = heatmap_hw
+ self.sigma = gaussian_sigma
+ self.kernel_radius = self.sigma * 3
+ self.use_kps_weights = False if keypoints_weights is None else True
+ self.kps_weights = keypoints_weights
+
+ # generate gaussian kernel(not normalized)
+ kernel_size = 2 * self.kernel_radius + 1
+ kernel = np.zeros((kernel_size, kernel_size), dtype=np.float32)
+ x_center = y_center = kernel_size // 2
+ for x in range(kernel_size):
+ for y in range(kernel_size):
+ kernel[y, x] = np.exp(-((x - x_center) ** 2 + (y - y_center) ** 2) / (2 * self.sigma ** 2))
+ # print(kernel)
+
+ self.kernel = kernel
+
+ def __call__(self, image, target):
+ kps = target["keypoints"]
+ num_kps = kps.shape[0]
+ kps_weights = np.ones((num_kps,), dtype=np.float32)
+ if "visible" in target:
+ visible = target["visible"]
+ kps_weights = visible
+
+ heatmap = np.zeros((num_kps, self.heatmap_hw[0], self.heatmap_hw[1]), dtype=np.float32)
+        heatmap_kps = (kps / 4 + 0.5).astype(np.int64)  # round to the nearest heatmap cell (np.int was removed from NumPy)
+ for kp_id in range(num_kps):
+ v = kps_weights[kp_id]
+ if v < 0.5:
+                # skip keypoints whose visibility is very low
+ continue
+
+ x, y = heatmap_kps[kp_id]
+ ul = [x - self.kernel_radius, y - self.kernel_radius] # up-left x,y
+ br = [x + self.kernel_radius, y + self.kernel_radius] # bottom-right x,y
+            # if the window of radius kernel_radius centered at (x, y) has no overlap with the heatmap,
+            # ignore this keypoint (the check is not strict)
+            if ul[0] > self.heatmap_hw[1] - 1 or \
+                    ul[1] > self.heatmap_hw[0] - 1 or \
+                    br[0] < 0 or \
+                    br[1] < 0:
+ kps_weights[kp_id] = 0
+ continue
+
+ # Usable gaussian range
+            # valid range of the gaussian kernel (in kernel coordinates)
+ g_x = (max(0, -ul[0]), min(br[0], self.heatmap_hw[1] - 1) - ul[0])
+ g_y = (max(0, -ul[1]), min(br[1], self.heatmap_hw[0] - 1) - ul[1])
+ # image range
+            # valid range within the heatmap (in heatmap coordinates)
+ img_x = (max(0, ul[0]), min(br[0], self.heatmap_hw[1] - 1))
+ img_y = (max(0, ul[1]), min(br[1], self.heatmap_hw[0] - 1))
+
+ if kps_weights[kp_id] > 0.5:
+                # copy the valid part of the gaussian kernel into the corresponding heatmap region
+ heatmap[kp_id][img_y[0]:img_y[1] + 1, img_x[0]:img_x[1] + 1] = \
+ self.kernel[g_y[0]:g_y[1] + 1, g_x[0]:g_x[1] + 1]
+
+ if self.use_kps_weights:
+ kps_weights = np.multiply(kps_weights, self.kps_weights)
+
+ # plot_heatmap(image, heatmap, kps, kps_weights)
+
+ target["heatmap"] = torch.as_tensor(heatmap, dtype=torch.float32)
+ target["kps_weights"] = torch.as_tensor(kps_weights, dtype=torch.float32)
+
+ return image, target
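
A sketch of how these transforms compose into a training pipeline. The flip pairs, body-part ids and keypoint weights below are placeholders; in the project they come from `person_keypoints.json`:

```python
import transforms  # transforms.py from this project

fixed_size = (256, 192)            # (h, w) network input
heatmap_hw = (256 // 4, 192 // 4)  # heatmaps are 1/4 of the input resolution
flip_pairs = [[1, 2], [3, 4]]      # placeholder left/right keypoint index pairs
kps_weights = [1.0] * 17           # placeholder per-keypoint loss weights

train_transform = transforms.Compose([
    transforms.HalfBody(0.3, upper_body_ids=list(range(11)), lower_body_ids=list(range(11, 17))),
    transforms.AffineTransform(scale=(0.65, 1.35), rotation=(-45, 45), fixed_size=fixed_size),
    transforms.RandomHorizontalFlip(0.5, matched_parts=flip_pairs),
    transforms.KeypointToHeatMap(heatmap_hw=heatmap_hw, gaussian_sigma=2, keypoints_weights=kps_weights),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# image, target = train_transform(image, target)  # target needs "box", "keypoints", "visible"
```
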
diff --git a/pytorch_keypoint/HRNet/validation.py b/pytorch_keypoint/HRNet/validation.py
new file mode 100644
index 000000000..63d7611d0
--- /dev/null
+++ b/pytorch_keypoint/HRNet/validation.py
@@ -0,0 +1,205 @@
+"""
+This script loads trained model weights and computes COCO metrics on the validation/test set.
+"""
+
+import os
+import json
+
+import torch
+from tqdm import tqdm
+import numpy as np
+
+from model import HighResolutionNet
+from train_utils import EvalCOCOMetric
+from my_dataset_coco import CocoKeypoint
+import transforms
+
+
+def summarize(self, catId=None):
+ """
+ Compute and display summary metrics for evaluation results.
+    Note this function can *only* be applied on the default parameter setting
+ """
+
+ def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100):
+ p = self.params
+ iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
+ titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
+ typeStr = '(AP)' if ap == 1 else '(AR)'
+ iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
+ if iouThr is None else '{:0.2f}'.format(iouThr)
+
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+
+ if ap == 1:
+ # dimension of precision: [TxRxKxAxM]
+ s = self.eval['precision']
+ # IoU
+ if iouThr is not None:
+ t = np.where(iouThr == p.iouThrs)[0]
+ s = s[t]
+
+ if isinstance(catId, int):
+ s = s[:, :, catId, aind, mind]
+ else:
+ s = s[:, :, :, aind, mind]
+
+ else:
+ # dimension of recall: [TxKxAxM]
+ s = self.eval['recall']
+ if iouThr is not None:
+ t = np.where(iouThr == p.iouThrs)[0]
+ s = s[t]
+
+ if isinstance(catId, int):
+ s = s[:, catId, aind, mind]
+ else:
+ s = s[:, :, aind, mind]
+
+ if len(s[s > -1]) == 0:
+ mean_s = -1
+ else:
+ mean_s = np.mean(s[s > -1])
+
+ print_string = iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)
+ return mean_s, print_string
+
+    if not self.eval:
+        raise Exception('Please run accumulate() first')
+
+    stats, print_list = [0] * 10, [""] * 10
+ stats[0], print_list[0] = _summarize(1, maxDets=20)
+ stats[1], print_list[1] = _summarize(1, maxDets=20, iouThr=.5)
+ stats[2], print_list[2] = _summarize(1, maxDets=20, iouThr=.75)
+ stats[3], print_list[3] = _summarize(1, maxDets=20, areaRng='medium')
+ stats[4], print_list[4] = _summarize(1, maxDets=20, areaRng='large')
+ stats[5], print_list[5] = _summarize(0, maxDets=20)
+ stats[6], print_list[6] = _summarize(0, maxDets=20, iouThr=.5)
+ stats[7], print_list[7] = _summarize(0, maxDets=20, iouThr=.75)
+ stats[8], print_list[8] = _summarize(0, maxDets=20, areaRng='medium')
+ stats[9], print_list[9] = _summarize(0, maxDets=20, areaRng='large')
+
+ print_info = "\n".join(print_list)
+
+ return stats, print_info
+
+
+def save_info(coco_evaluator,
+ save_name: str = "record_mAP.txt"):
+ # calculate COCO info for all keypoints
+ coco_stats, print_coco = summarize(coco_evaluator)
+
+    # save the validation results to a txt file
+ with open(save_name, "w") as f:
+ record_lines = ["COCO results:", print_coco]
+ f.write("\n".join(record_lines))
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+    print("Using {} device.".format(device.type))
+
+ data_transform = {
+ "val": transforms.Compose([
+ transforms.AffineTransform(scale=(1.25, 1.25), fixed_size=args.resize_hw),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+ }
+
+ # read class_indict
+ label_json_path = args.label_json_path
+    assert os.path.exists(label_json_path), "json file {} does not exist.".format(label_json_path)
+ with open(label_json_path, 'r') as f:
+ person_coco_info = json.load(f)
+
+ data_root = args.data_path
+
+    # note: the dataset provides a custom collate_fn, because each sample contains both the image and its targets and cannot be batched with the default collate
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using %g dataloader workers' % nw)
+
+ # load validation data set
+ val_dataset = CocoKeypoint(data_root, "val", transforms=data_transform["val"], det_json_path=None)
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
+ # val_dataset = VOCInstances(data_root, year="2012", txt_name="val.txt", transforms=data_transform["val"])
+ val_dataset_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=batch_size,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ # create model
+ model = HighResolutionNet()
+
+    # load your own trained model weights
+ weights_path = args.weights_path
+ assert os.path.exists(weights_path), "not found {} file.".format(weights_path)
+ model.load_state_dict(torch.load(weights_path, map_location='cpu'))
+ # print(model)
+ model.to(device)
+
+ # evaluate on the val dataset
+ key_metric = EvalCOCOMetric(val_dataset.coco, "keypoints", "key_results.json")
+ model.eval()
+ with torch.no_grad():
+ for images, targets in tqdm(val_dataset_loader, desc="validation..."):
+            # move the images to the specified device
+ images = images.to(device)
+
+ # inference
+ outputs = model(images)
+ if args.flip:
+ flipped_images = transforms.flip_images(images)
+ flipped_outputs = model(flipped_images)
+ flipped_outputs = transforms.flip_back(flipped_outputs, person_coco_info["flip_pairs"])
+ # feature is not aligned, shift flipped heatmap for higher accuracy
+ # https://github.com/leoxiaobin/deep-high-resolution-net.pytorch/issues/22
+ flipped_outputs[..., 1:] = flipped_outputs.clone()[..., 0:-1]
+ outputs = (outputs + flipped_outputs) * 0.5
+
+ # decode keypoint
+ reverse_trans = [t["reverse_trans"] for t in targets]
+ outputs = transforms.get_final_preds(outputs, reverse_trans, post_processing=True)
+
+ key_metric.update(targets, outputs)
+
+ key_metric.synchronize_results()
+ key_metric.evaluate()
+
+ save_info(key_metric.coco_evaluator, "keypoint_record_mAP.txt")
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+    # device type to use
+    parser.add_argument('--device', default='cuda:0', help='device')
+
+    parser.add_argument('--resize-hw', nargs='+', type=int, default=[256, 192], help="resize (h, w) used for prediction")
+    # whether to additionally evaluate on horizontally flipped images
+    parser.add_argument('--flip', type=bool, default=True, help='whether using flipped images')
+
+    # root directory of the dataset
+    parser.add_argument('--data-path', default='/data/coco2017', help='dataset root')
+
+    # trained weights file
+    parser.add_argument('--weights-path', default='./pose_hrnet_w32_256x192.pth', type=str, help='training weights')
+
+    # batch size
+    parser.add_argument('--batch-size', default=1, type=int, metavar='N',
+                        help='batch size used for validation')
+    # person keypoints metadata json (keypoint names, flip pairs, etc.)
+    parser.add_argument('--label-json-path', type=str, default="person_keypoints.json")
+    # person detection results on the val set provided by the original project; set to None to use the GT information instead
+    parser.add_argument('--person-det', type=str, default="./COCO_val2017_detections_AP_H_56_person.json")
+
+ args = parser.parse_args()
+
+ main(args)
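
If `key_results.json` already exists, the keypoint metrics can also be recomputed offline with pycocotools directly, mirroring what `EvalCOCOMetric.evaluate` does; the annotation path below is a placeholder:

```python
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("/data/coco2017/annotations/person_keypoints_val2017.json")  # placeholder path
coco_dt = coco_gt.loadRes("key_results.json")  # file written by EvalCOCOMetric

coco_eval = COCOeval(cocoGt=coco_gt, cocoDt=coco_dt, iouType="keypoints")
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
print(coco_eval.stats)  # AP/AR values at the thresholds printed above
```
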
diff --git a/pytorch_object_detection/faster_rcnn/README.md b/pytorch_object_detection/faster_rcnn/README.md
index c674ca58a..08ac15cf0 100644
--- a/pytorch_object_detection/faster_rcnn/README.md
+++ b/pytorch_object_detection/faster_rcnn/README.md
@@ -6,10 +6,10 @@
## 环境配置:
* Python3.6/3.7/3.8
* Pytorch1.7.1(注意:必须是1.6.0或以上,因为使用官方提供的混合精度训练1.6.0后才支持)
-* pycocotools(Linux:```pip install pycocotools```; Windows:```pip install pycocotools-windows```(不需要额外安装vs))
+* pycocotools(Linux:`pip install pycocotools`; Windows:`pip install pycocotools-windows`(不需要额外安装vs))
* Ubuntu或Centos(不建议Windows)
* 最好使用GPU训练
-* 详细环境配置见```requirements.txt```
+* 详细环境配置见`requirements.txt`
## 文件结构:
```
@@ -26,10 +26,11 @@
```
## 预训练权重下载地址(下载后放入backbone文件夹中):
-* MobileNetV2 backbone: https://download.pytorch.org/models/mobilenet_v2-b0353104.pth
-* ResNet50+FPN backbone: https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
-* 注意,下载的预训练权重记得要重命名,比如在train_resnet50_fpn.py中读取的是```fasterrcnn_resnet50_fpn_coco.pth```文件,
- 不是```fasterrcnn_resnet50_fpn_coco-258fb6c6.pth```
+* MobileNetV2 weights(下载后重命名为`mobilenet_v2.pth`,然后放到`backbone`文件夹下): https://download.pytorch.org/models/mobilenet_v2-b0353104.pth
+* Resnet50 weights(下载后重命名为`resnet50.pth`,然后放到`backbone`文件夹下): https://download.pytorch.org/models/resnet50-0676ba61.pth
+* ResNet50+FPN weights: https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
+* 注意,下载的预训练权重记得要重命名,比如在train_resnet50_fpn.py中读取的是`fasterrcnn_resnet50_fpn_coco.pth`文件,
+ 不是`fasterrcnn_resnet50_fpn_coco-258fb6c6.pth`,然后放到当前项目根目录下即可。
## 数据集,本例程使用的是PASCAL VOC2012数据集
@@ -42,16 +43,17 @@
* 确保提前下载好对应预训练模型权重
* 若要训练mobilenetv2+fasterrcnn,直接使用train_mobilenet.py训练脚本
* 若要训练resnet50+fpn+fasterrcnn,直接使用train_resnet50_fpn.py训练脚本
-* 若要使用多GPU训练,使用```python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py```指令,```nproc_per_node```参数为使用GPU数量
-* 如果想指定使用哪些GPU设备可在指令前加上```CUDA_VISIBLE_DEVICES=0,3```(例如我只要使用设备中的第1块和第4块GPU设备)
-* ```CUDA_VISIBLE_DEVICES=0,3 python -m torch.distributed.launch --nproc_per_node=2 --use_env train_multi_GPU.py```
+* 若要使用多GPU训练,使用`python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py`指令,`nproc_per_node`参数为使用GPU数量
+* 如果想指定使用哪些GPU设备可在指令前加上`CUDA_VISIBLE_DEVICES=0,3`(例如我只要使用设备中的第1块和第4块GPU设备)
+* `CUDA_VISIBLE_DEVICES=0,3 python -m torch.distributed.launch --nproc_per_node=2 --use_env train_multi_GPU.py`
## 注意事项
-* 在使用训练脚本时,注意要将'--data-path'(VOC_root)设置为自己存放'VOCdevkit'文件夹所在的**根目录**
+* 在使用训练脚本时,注意要将`--data-path`(VOC_root)设置为自己存放`VOCdevkit`文件夹所在的**根目录**
* 由于带有FPN结构的Faster RCNN很吃显存,如果GPU的显存不够(如果batch_size小于8的话)建议在create_model函数中使用默认的norm_layer,
即不传递norm_layer变量,默认去使用FrozenBatchNorm2d(即不会去更新参数的bn层),使用中发现效果也很好。
-* 在使用预测脚本时,要将'train_weights'设置为你自己生成的权重路径。
-* 使用validation文件时,注意确保你的验证集或者测试集中必须包含每个类别的目标,并且使用时只需要修改'--num-classes'、'--data-path'和'--weights'即可,其他代码尽量不要改动
+* 训练过程中保存的`results.txt`是每个epoch在验证集上的COCO指标,前12个值是COCO指标,后面两个值是训练平均损失以及学习率
+* 在使用预测脚本时,要将`train_weights`设置为你自己生成的权重路径。
+* 使用validation文件时,注意确保你的验证集或者测试集中必须包含每个类别的目标,并且使用时只需要修改`--num-classes`、`--data-path`和`--weights-path`即可,其他代码尽量不要改动
## 如果对Faster RCNN原理不是很理解可参考我的bilibili
* https://b23.tv/sXcBSP
diff --git a/pytorch_object_detection/faster_rcnn/backbone/__init__.py b/pytorch_object_detection/faster_rcnn/backbone/__init__.py
index f7559da86..1cedf7584 100644
--- a/pytorch_object_detection/faster_rcnn/backbone/__init__.py
+++ b/pytorch_object_detection/faster_rcnn/backbone/__init__.py
@@ -1,3 +1,4 @@
from .resnet50_fpn_model import resnet50_fpn_backbone
from .mobilenetv2_model import MobileNetV2
from .vgg_model import vgg
+from .feature_pyramid_network import LastLevelMaxPool, BackboneWithFPN
diff --git a/pytorch_object_detection/faster_rcnn/backbone/feature_pyramid_network.py b/pytorch_object_detection/faster_rcnn/backbone/feature_pyramid_network.py
index 636829fc0..450960985 100644
--- a/pytorch_object_detection/faster_rcnn/backbone/feature_pyramid_network.py
+++ b/pytorch_object_detection/faster_rcnn/backbone/feature_pyramid_network.py
@@ -8,6 +8,59 @@
from torch.jit.annotations import Tuple, List, Dict
+class IntermediateLayerGetter(nn.ModuleDict):
+ """
+ Module wrapper that returns intermediate layers from a model
+ It has a strong assumption that the modules have been registered
+ into the model in the same order as they are used.
+ This means that one should **not** reuse the same nn.Module
+ twice in the forward if you want this to work.
+ Additionally, it is only able to query submodules that are directly
+ assigned to the model. So if `model` is passed, `model.feature1` can
+ be returned, but not `model.feature1.layer2`.
+ Arguments:
+ model (nn.Module): model on which we will extract the features
+ return_layers (Dict[name, new_name]): a dict containing the names
+ of the modules for which the activations will be returned as
+ the key of the dict, and the value of the dict is the name
+ of the returned activation (which the user can specify).
+ """
+ __annotations__ = {
+ "return_layers": Dict[str, str],
+ }
+
+ def __init__(self, model, return_layers):
+ if not set(return_layers).issubset([name for name, _ in model.named_children()]):
+ raise ValueError("return_layers are not present in model")
+
+ orig_return_layers = return_layers
+ return_layers = {str(k): str(v) for k, v in return_layers.items()}
+ layers = OrderedDict()
+
+        # iterate over the model's children in order and store them in an ordered dict,
+        # keeping only layer4 and everything before it; later structures are discarded
+ for name, module in model.named_children():
+ layers[name] = module
+ if name in return_layers:
+ del return_layers[name]
+ if not return_layers:
+ break
+
+ super().__init__(layers)
+ self.return_layers = orig_return_layers
+
+ def forward(self, x):
+ out = OrderedDict()
+        # run every submodule in order and collect the outputs
+        # of layer1, layer2, layer3 and layer4
+ for name, module in self.items():
+ x = module(x)
+ if name in self.return_layers:
+ out_name = self.return_layers[name]
+ out[out_name] = x
+ return out
+
+
class FeaturePyramidNetwork(nn.Module):
"""
Module that adds a FPN from on top of a set of feature maps. This is based on
@@ -27,7 +80,7 @@ class FeaturePyramidNetwork(nn.Module):
"""
def __init__(self, in_channels_list, out_channels, extra_blocks=None):
- super(FeaturePyramidNetwork, self).__init__()
+ super().__init__()
# 用来调整resnet特征矩阵(layer1,2,3,4)的channel(kernel_size=1)
self.inner_blocks = nn.ModuleList()
# 对调整后的特征矩阵使用3x3的卷积核来得到对应的预测特征矩阵
@@ -48,8 +101,7 @@ def __init__(self, in_channels_list, out_channels, extra_blocks=None):
self.extra_blocks = extra_blocks
- def get_result_from_inner_blocks(self, x, idx):
- # type: (Tensor, int) -> Tensor
+ def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.inner_blocks[idx](x),
but torchscript doesn't support this yet
@@ -65,8 +117,7 @@ def get_result_from_inner_blocks(self, x, idx):
i += 1
return out
- def get_result_from_layer_blocks(self, x, idx):
- # type: (Tensor, int) -> Tensor
+ def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.layer_blocks[idx](x),
but torchscript doesn't support this yet
@@ -82,8 +133,7 @@ def get_result_from_layer_blocks(self, x, idx):
i += 1
return out
- def forward(self, x):
- # type: (Dict[str, Tensor]) -> Dict[str, Tensor]
+ def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""
Computes the FPN for a set of feature maps.
Arguments:
@@ -127,8 +177,59 @@ class LastLevelMaxPool(torch.nn.Module):
Applies a max_pool2d on top of the last feature map
"""
- def forward(self, x, y, names):
- # type: (List[Tensor], List[Tensor], List[str]) -> Tuple[List[Tensor], List[str]]
+ def forward(self, x: List[Tensor], y: List[Tensor], names: List[str]) -> Tuple[List[Tensor], List[str]]:
names.append("pool")
x.append(F.max_pool2d(x[-1], 1, 2, 0)) # input, kernel_size, stride, padding
return x, names
+
+
+class BackboneWithFPN(nn.Module):
+ """
+ Adds a FPN on top of a model.
+ Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
+ extract a submodel that returns the feature maps specified in return_layers.
+    The same limitations of IntermediateLayerGetter apply here.
+ Arguments:
+ backbone (nn.Module)
+ return_layers (Dict[name, new_name]): a dict containing the names
+ of the modules for which the activations will be returned as
+ the key of the dict, and the value of the dict is the name
+ of the returned activation (which the user can specify).
+ in_channels_list (List[int]): number of channels for each feature map
+ that is returned, in the order they are present in the OrderedDict
+ out_channels (int): number of channels in the FPN.
+ extra_blocks: ExtraFPNBlock
+ Attributes:
+ out_channels (int): the number of channels in the FPN
+ """
+
+ def __init__(self,
+ backbone: nn.Module,
+ return_layers=None,
+ in_channels_list=None,
+ out_channels=256,
+ extra_blocks=None,
+ re_getter=True):
+ super().__init__()
+
+ if extra_blocks is None:
+ extra_blocks = LastLevelMaxPool()
+
+ if re_getter is True:
+ assert return_layers is not None
+ self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+ else:
+ self.body = backbone
+
+ self.fpn = FeaturePyramidNetwork(
+ in_channels_list=in_channels_list,
+ out_channels=out_channels,
+ extra_blocks=extra_blocks,
+ )
+
+ self.out_channels = out_channels
+
+ def forward(self, x):
+ x = self.body(x)
+ x = self.fpn(x)
+ return x
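
A sketch of wrapping a plain torchvision ResNet-50 with `BackboneWithFPN` (assumes a recent torchvision and running from the faster_rcnn project root); the channel list follows ResNet-50's layer1-4 outputs:

```python
import torch
import torchvision
from backbone import BackboneWithFPN  # exported via backbone/__init__.py above

resnet = torchvision.models.resnet50(weights=None)
return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
in_channels_list = [256, 512, 1024, 2048]

backbone = BackboneWithFPN(resnet,
                           return_layers=return_layers,
                           in_channels_list=in_channels_list,
                           out_channels=256)

feats = backbone(torch.randn(1, 3, 224, 224))
print([(k, tuple(v.shape)) for k, v in feats.items()])
# four FPN levels ('0'..'3') with 256 channels each, plus the extra max-pooled 'pool' level
```
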
diff --git a/pytorch_object_detection/faster_rcnn/backbone/resnet50_fpn_model.py b/pytorch_object_detection/faster_rcnn/backbone/resnet50_fpn_model.py
index 8c796cfac..b15930765 100644
--- a/pytorch_object_detection/faster_rcnn/backbone/resnet50_fpn_model.py
+++ b/pytorch_object_detection/faster_rcnn/backbone/resnet50_fpn_model.py
@@ -1,19 +1,17 @@
import os
-from collections import OrderedDict
import torch
import torch.nn as nn
-from torch.jit.annotations import List, Dict
from torchvision.ops.misc import FrozenBatchNorm2d
-from .feature_pyramid_network import FeaturePyramidNetwork, LastLevelMaxPool
+from .feature_pyramid_network import BackboneWithFPN, LastLevelMaxPool
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None):
- super(Bottleneck, self).__init__()
+ super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
@@ -56,7 +54,7 @@ def forward(self, x):
class ResNet(nn.Module):
def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None):
- super(ResNet, self).__init__()
+ super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
@@ -136,100 +134,6 @@ def overwrite_eps(model, eps):
module.eps = eps
-class IntermediateLayerGetter(nn.ModuleDict):
- """
- Module wrapper that returns intermediate layers from a model
- It has a strong assumption that the modules have been registered
- into the model in the same order as they are used.
- This means that one should **not** reuse the same nn.Module
- twice in the forward if you want this to work.
- Additionally, it is only able to query submodules that are directly
- assigned to the model. So if `model` is passed, `model.feature1` can
- be returned, but not `model.feature1.layer2`.
- Arguments:
- model (nn.Module): model on which we will extract the features
- return_layers (Dict[name, new_name]): a dict containing the names
- of the modules for which the activations will be returned as
- the key of the dict, and the value of the dict is the name
- of the returned activation (which the user can specify).
- """
- __annotations__ = {
- "return_layers": Dict[str, str],
- }
-
- def __init__(self, model, return_layers):
- if not set(return_layers).issubset([name for name, _ in model.named_children()]):
- raise ValueError("return_layers are not present in model")
-
- orig_return_layers = return_layers
- return_layers = {str(k): str(v) for k, v in return_layers.items()}
- layers = OrderedDict()
-
- # 遍历模型子模块按顺序存入有序字典
- # 只保存layer4及其之前的结构,舍去之后不用的结构
- for name, module in model.named_children():
- layers[name] = module
- if name in return_layers:
- del return_layers[name]
- if not return_layers:
- break
-
- super(IntermediateLayerGetter, self).__init__(layers)
- self.return_layers = orig_return_layers
-
- def forward(self, x):
- out = OrderedDict()
- # 依次遍历模型的所有子模块,并进行正向传播,
- # 收集layer1, layer2, layer3, layer4的输出
- for name, module in self.items():
- x = module(x)
- if name in self.return_layers:
- out_name = self.return_layers[name]
- out[out_name] = x
- return out
-
-
-class BackboneWithFPN(nn.Module):
- """
- Adds a FPN on top of a model.
- Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
- extract a submodel that returns the feature maps specified in return_layers.
- The same limitations of IntermediatLayerGetter apply here.
- Arguments:
- backbone (nn.Module)
- return_layers (Dict[name, new_name]): a dict containing the names
- of the modules for which the activations will be returned as
- the key of the dict, and the value of the dict is the name
- of the returned activation (which the user can specify).
- in_channels_list (List[int]): number of channels for each feature map
- that is returned, in the order they are present in the OrderedDict
- out_channels (int): number of channels in the FPN.
- extra_blocks: ExtraFPNBlock
- Attributes:
- out_channels (int): the number of channels in the FPN
- """
-
- def __init__(self, backbone, return_layers, in_channels_list, out_channels, extra_blocks=None):
- super(BackboneWithFPN, self).__init__()
-
- if extra_blocks is None:
- extra_blocks = LastLevelMaxPool()
-
- self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
- self.fpn = FeaturePyramidNetwork(
- in_channels_list=in_channels_list,
- out_channels=out_channels,
- extra_blocks=extra_blocks,
- )
-
- self.out_channels = out_channels
-
- def forward(self, x):
- x = self.body(x)
- x = self.fpn(x)
- return x
-
-
def resnet50_fpn_backbone(pretrain_path="",
norm_layer=FrozenBatchNorm2d, # FrozenBatchNorm2d的功能与BatchNorm2d类似,但参数无法更新
trainable_layers=3,
diff --git a/pytorch_object_detection/faster_rcnn/change_backbone_with_fpn.py b/pytorch_object_detection/faster_rcnn/change_backbone_with_fpn.py
new file mode 100644
index 000000000..4ee20fd44
--- /dev/null
+++ b/pytorch_object_detection/faster_rcnn/change_backbone_with_fpn.py
@@ -0,0 +1,255 @@
+import os
+import datetime
+
+import torch
+
+import transforms
+from network_files import FasterRCNN, AnchorsGenerator
+from my_dataset import VOCDataSet
+from train_utils import GroupedBatchSampler, create_aspect_ratio_groups
+from train_utils import train_eval_utils as utils
+from backbone import BackboneWithFPN, LastLevelMaxPool
+
+
+def create_model(num_classes):
+ import torchvision
+ from torchvision.models.feature_extraction import create_feature_extractor
+
+ # --- mobilenet_v3_large fpn backbone --- #
+ backbone = torchvision.models.mobilenet_v3_large(pretrained=True)
+ # print(backbone)
+ return_layers = {"features.6": "0", # stride 8
+ "features.12": "1", # stride 16
+ "features.16": "2"} # stride 32
+    # channels of each feature map fed to the FPN
+ in_channels_list = [40, 112, 960]
+ new_backbone = create_feature_extractor(backbone, return_layers)
+ # img = torch.randn(1, 3, 224, 224)
+ # outputs = new_backbone(img)
+ # [print(f"{k} shape: {v.shape}") for k, v in outputs.items()]
+
+ # --- efficientnet_b0 fpn backbone --- #
+ # backbone = torchvision.models.efficientnet_b0(pretrained=True)
+ # # print(backbone)
+ # return_layers = {"features.3": "0", # stride 8
+ # "features.4": "1", # stride 16
+ # "features.8": "2"} # stride 32
+    # # channels of each feature map fed to the FPN
+ # in_channels_list = [40, 80, 1280]
+ # new_backbone = create_feature_extractor(backbone, return_layers)
+ # # img = torch.randn(1, 3, 224, 224)
+ # # outputs = new_backbone(img)
+ # # [print(f"{k} shape: {v.shape}") for k, v in outputs.items()]
+
+ backbone_with_fpn = BackboneWithFPN(new_backbone,
+ return_layers=return_layers,
+ in_channels_list=in_channels_list,
+ out_channels=256,
+ extra_blocks=LastLevelMaxPool(),
+ re_getter=False)
+
+ anchor_sizes = ((64,), (128,), (256,), (512,))
+ aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
+ anchor_generator = AnchorsGenerator(sizes=anchor_sizes,
+ aspect_ratios=aspect_ratios)
+
+    roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0', '1', '2'],  # feature levels used for RoIAlign pooling
+                                                    output_size=[7, 7],  # output size of the RoIAlign pooled feature maps
+                                                    sampling_ratio=2)  # sampling ratio
+
+ model = FasterRCNN(backbone=backbone_with_fpn,
+ num_classes=num_classes,
+ rpn_anchor_generator=anchor_generator,
+ box_roi_pool=roi_pooler)
+
+ return model
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+ print("Using {} device training.".format(device.type))
+
+ # 用来保存coco_info的文件
+ results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
+
+ data_transform = {
+ "train": transforms.Compose([transforms.ToTensor(),
+ transforms.RandomHorizontalFlip(0.5)]),
+ "val": transforms.Compose([transforms.ToTensor()])
+ }
+
+ VOC_root = args.data_path
+ # check voc root
+ if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
+        raise FileNotFoundError("VOCdevkit does not exist in path:'{}'.".format(VOC_root))
+
+ # load train data set
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
+ train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"], "train.txt")
+ train_sampler = None
+
+ # 是否按图片相似高宽比采样图片组成batch
+ # 使用的话能够减小训练时所需GPU显存,默认使用
+ if args.aspect_ratio_group_factor >= 0:
+ train_sampler = torch.utils.data.RandomSampler(train_dataset)
+ # 统计所有图像高宽比例在bins区间中的位置索引
+ group_ids = create_aspect_ratio_groups(train_dataset, k=args.aspect_ratio_group_factor)
+ # 每个batch图片从同一高宽比例区间中取
+ train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
+
+ # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using %g dataloader workers' % nw)
+ if train_sampler:
+ # 如果按照图片高宽比采样图片,dataloader中需要使用batch_sampler
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_sampler=train_batch_sampler,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+ else:
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ # load validation data set
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
+ val_dataset = VOCDataSet(VOC_root, "2012", data_transform["val"], "val.txt")
+ val_data_set_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=1,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ # create model num_classes equal background + 20 classes
+ model = create_model(num_classes=args.num_classes + 1)
+ # print(model)
+
+ model.to(device)
+
+ # define optimizer
+ params = [p for p in model.parameters() if p.requires_grad]
+ optimizer = torch.optim.SGD(params,
+ lr=args.lr,
+ momentum=args.momentum,
+ weight_decay=args.weight_decay)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+ # learning rate scheduler
+ lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
+ step_size=3,
+ gamma=0.33)
+
+ # 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练
+ if args.resume != "":
+ checkpoint = torch.load(args.resume, map_location='cpu')
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp and "scaler" in checkpoint:
+ scaler.load_state_dict(checkpoint["scaler"])
+ print("the training process from epoch{}...".format(args.start_epoch))
+
+ train_loss = []
+ learning_rate = []
+ val_map = []
+
+ for epoch in range(args.start_epoch, args.epochs):
+ # train for one epoch, printing every 10 iterations
+ mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
+ device=device, epoch=epoch,
+ print_freq=50, warmup=True,
+ scaler=scaler)
+ train_loss.append(mean_loss.item())
+ learning_rate.append(lr)
+
+ # update the learning rate
+ lr_scheduler.step()
+
+ # evaluate on the test dataset
+ coco_info = utils.evaluate(model, val_data_set_loader, device=device)
+
+ # write into txt
+ with open(results_file, "a") as f:
+ # 写入的数据包括coco指标还有loss和learning rate
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ val_map.append(coco_info[1]) # pascal mAP
+
+ # save weights
+ save_files = {
+ 'model': model.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'epoch': epoch}
+ if args.amp:
+ save_files["scaler"] = scaler.state_dict()
+ torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch))
+
+ # plot loss and lr curve
+ if len(train_loss) != 0 and len(learning_rate) != 0:
+ from plot_curve import plot_loss_and_lr
+ plot_loss_and_lr(train_loss, learning_rate)
+
+ # plot mAP curve
+ if len(val_map) != 0:
+ from plot_curve import plot_map
+ plot_map(val_map)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+ # 训练设备类型
+ parser.add_argument('--device', default='cuda:0', help='device')
+ # 训练数据集的根目录(VOCdevkit)
+ parser.add_argument('--data-path', default='./', help='dataset')
+ # 检测目标类别数(不包含背景)
+ parser.add_argument('--num-classes', default=20, type=int, help='num_classes')
+ # 文件保存地址
+ parser.add_argument('--output-dir', default='./save_weights', help='path where to save')
+ # 若需要接着上次训练,则指定上次训练保存权重文件地址
+ parser.add_argument('--resume', default='', type=str, help='resume from checkpoint')
+ # 指定接着从哪个epoch数开始训练
+ parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
+ # 训练的总epoch数
+ parser.add_argument('--epochs', default=15, type=int, metavar='N',
+ help='number of total epochs to run')
+ # 学习率
+ parser.add_argument('--lr', default=0.005, type=float,
+ help='initial learning rate, 0.02 is the default value for training '
+ 'on 8 gpus and 2 images_per_gpu')
+ # SGD的momentum参数
+ parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+ help='momentum')
+ # SGD的weight_decay参数
+ parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+ metavar='W', help='weight decay (default: 1e-4)',
+ dest='weight_decay')
+ # 训练的batch size
+ parser.add_argument('--batch_size', default=4, type=int, metavar='N',
+ help='batch size when training.')
+ parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
+ # 是否使用混合精度训练(需要GPU支持混合精度)
+ parser.add_argument("--amp", default=False, help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+ print(args)
+
+ # 检查保存权重文件夹是否存在,不存在则创建
+ if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+
+ main(args)
diff --git a/pytorch_object_detection/faster_rcnn/change_backbone_without_fpn.py b/pytorch_object_detection/faster_rcnn/change_backbone_without_fpn.py
new file mode 100644
index 000000000..f4c9e5938
--- /dev/null
+++ b/pytorch_object_detection/faster_rcnn/change_backbone_without_fpn.py
@@ -0,0 +1,243 @@
+import os
+import datetime
+
+import torch
+
+import transforms
+from network_files import FasterRCNN, AnchorsGenerator
+from my_dataset import VOCDataSet
+from train_utils import GroupedBatchSampler, create_aspect_ratio_groups
+from train_utils import train_eval_utils as utils
+
+
+def create_model(num_classes):
+ import torchvision
+ from torchvision.models.feature_extraction import create_feature_extractor
+
+ # vgg16
+ backbone = torchvision.models.vgg16_bn(pretrained=True)
+ # print(backbone)
+ backbone = create_feature_extractor(backbone, return_nodes={"features.42": "0"})
+ # out = backbone(torch.rand(1, 3, 224, 224))
+ # print(out["0"].shape)
+ backbone.out_channels = 512
+
+ # resnet50 backbone
+ # backbone = torchvision.models.resnet50(pretrained=True)
+ # # print(backbone)
+ # backbone = create_feature_extractor(backbone, return_nodes={"layer3": "0"})
+ # # out = backbone(torch.rand(1, 3, 224, 224))
+ # # print(out["0"].shape)
+ # backbone.out_channels = 1024
+
+ # EfficientNetB0
+ # backbone = torchvision.models.efficientnet_b0(pretrained=True)
+ # # print(backbone)
+ # backbone = create_feature_extractor(backbone, return_nodes={"features.5": "0"})
+ # # out = backbone(torch.rand(1, 3, 224, 224))
+ # # print(out["0"].shape)
+ # backbone.out_channels = 112
+
+ anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
+ aspect_ratios=((0.5, 1.0, 2.0),))
+
+    roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],  # feature levels used for RoIAlign pooling
+                                                    output_size=[7, 7],  # output size of the RoIAlign pooled feature maps
+                                                    sampling_ratio=2)  # sampling ratio
+
+ model = FasterRCNN(backbone=backbone,
+ num_classes=num_classes,
+ rpn_anchor_generator=anchor_generator,
+ box_roi_pool=roi_pooler)
+
+ return model
+
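
The node names passed to `create_feature_extractor` (such as `"features.42"` above) can be listed beforehand; a short sketch, assuming torchvision >= 0.11 where `get_graph_node_names` is available:

```python
import torchvision
from torchvision.models.feature_extraction import get_graph_node_names

backbone = torchvision.models.vgg16_bn(pretrained=False)
train_nodes, eval_nodes = get_graph_node_names(backbone)
# inspect candidate return_nodes, e.g. the last few feature layers of vgg16_bn
print(eval_nodes[-10:])
```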
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+ print("Using {} device training.".format(device.type))
+
+ # 用来保存coco_info的文件
+ results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
+
+ data_transform = {
+ "train": transforms.Compose([transforms.ToTensor(),
+ transforms.RandomHorizontalFlip(0.5)]),
+ "val": transforms.Compose([transforms.ToTensor()])
+ }
+
+ VOC_root = args.data_path
+ # check voc root
+ if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
+        raise FileNotFoundError("VOCdevkit does not exist in path:'{}'.".format(VOC_root))
+
+ # load train data set
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
+ train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"], "train.txt")
+ train_sampler = None
+
+ # 是否按图片相似高宽比采样图片组成batch
+ # 使用的话能够减小训练时所需GPU显存,默认使用
+ if args.aspect_ratio_group_factor >= 0:
+ train_sampler = torch.utils.data.RandomSampler(train_dataset)
+ # 统计所有图像高宽比例在bins区间中的位置索引
+ group_ids = create_aspect_ratio_groups(train_dataset, k=args.aspect_ratio_group_factor)
+ # 每个batch图片从同一高宽比例区间中取
+ train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
+
+ # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using %g dataloader workers' % nw)
+ if train_sampler:
+ # 如果按照图片高宽比采样图片,dataloader中需要使用batch_sampler
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_sampler=train_batch_sampler,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+ else:
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ # load validation data set
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
+ val_dataset = VOCDataSet(VOC_root, "2012", data_transform["val"], "val.txt")
+ val_data_set_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=1,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ # create model num_classes equal background + 20 classes
+ model = create_model(num_classes=args.num_classes + 1)
+ # print(model)
+
+ model.to(device)
+
+ # define optimizer
+ params = [p for p in model.parameters() if p.requires_grad]
+ optimizer = torch.optim.SGD(params,
+ lr=args.lr,
+ momentum=args.momentum,
+ weight_decay=args.weight_decay)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+ # learning rate scheduler
+ lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
+ step_size=3,
+ gamma=0.33)
+
+ # 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练
+ if args.resume != "":
+ checkpoint = torch.load(args.resume, map_location='cpu')
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp and "scaler" in checkpoint:
+ scaler.load_state_dict(checkpoint["scaler"])
+ print("the training process from epoch{}...".format(args.start_epoch))
+
+ train_loss = []
+ learning_rate = []
+ val_map = []
+
+ for epoch in range(args.start_epoch, args.epochs):
+ # train for one epoch, printing every 10 iterations
+ mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
+ device=device, epoch=epoch,
+ print_freq=50, warmup=True,
+ scaler=scaler)
+ train_loss.append(mean_loss.item())
+ learning_rate.append(lr)
+
+ # update the learning rate
+ lr_scheduler.step()
+
+ # evaluate on the test dataset
+ coco_info = utils.evaluate(model, val_data_set_loader, device=device)
+
+ # write into txt
+ with open(results_file, "a") as f:
+ # 写入的数据包括coco指标还有loss和learning rate
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ val_map.append(coco_info[1]) # pascal mAP
+
+ # save weights
+ save_files = {
+ 'model': model.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'epoch': epoch}
+ if args.amp:
+ save_files["scaler"] = scaler.state_dict()
+ torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch))
+
+ # plot loss and lr curve
+ if len(train_loss) != 0 and len(learning_rate) != 0:
+ from plot_curve import plot_loss_and_lr
+ plot_loss_and_lr(train_loss, learning_rate)
+
+ # plot mAP curve
+ if len(val_map) != 0:
+ from plot_curve import plot_map
+ plot_map(val_map)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+ # 训练设备类型
+ parser.add_argument('--device', default='cuda:0', help='device')
+ # 训练数据集的根目录(VOCdevkit)
+ parser.add_argument('--data-path', default='./', help='dataset')
+ # 检测目标类别数(不包含背景)
+ parser.add_argument('--num-classes', default=20, type=int, help='num_classes')
+ # 文件保存地址
+ parser.add_argument('--output-dir', default='./save_weights', help='path where to save')
+ # 若需要接着上次训练,则指定上次训练保存权重文件地址
+ parser.add_argument('--resume', default='', type=str, help='resume from checkpoint')
+ # 指定接着从哪个epoch数开始训练
+ parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
+ # 训练的总epoch数
+ parser.add_argument('--epochs', default=15, type=int, metavar='N',
+ help='number of total epochs to run')
+ # 学习率
+ parser.add_argument('--lr', default=0.005, type=float,
+ help='initial learning rate, 0.02 is the default value for training '
+ 'on 8 gpus and 2 images_per_gpu')
+ # SGD的momentum参数
+ parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+ help='momentum')
+ # SGD的weight_decay参数
+ parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+ metavar='W', help='weight decay (default: 1e-4)',
+ dest='weight_decay')
+ # 训练的batch size
+ parser.add_argument('--batch_size', default=4, type=int, metavar='N',
+ help='batch size when training.')
+ parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
+ # 是否使用混合精度训练(需要GPU支持混合精度)
+ parser.add_argument("--amp", default=False, help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+ print(args)
+
+ # 检查保存权重文件夹是否存在,不存在则创建
+ if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+
+ main(args)
diff --git a/pytorch_object_detection/faster_rcnn/draw_box_utils.py b/pytorch_object_detection/faster_rcnn/draw_box_utils.py
index 1a2926583..835d7f7c1 100644
--- a/pytorch_object_detection/faster_rcnn/draw_box_utils.py
+++ b/pytorch_object_detection/faster_rcnn/draw_box_utils.py
@@ -1,6 +1,7 @@
-import collections
+from PIL.Image import Image, fromarray
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
+from PIL import ImageColor
import numpy as np
STANDARD_COLORS = [
@@ -30,66 +31,123 @@
]
-def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map):
- for i in range(boxes.shape[0]):
- if scores[i] > thresh:
- box = tuple(boxes[i].tolist()) # numpy -> list -> tuple
- if classes[i] in category_index.keys():
- class_name = category_index[classes[i]]
- else:
- class_name = 'N/A'
- display_str = str(class_name)
- display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
- box_to_display_str_map[box].append(display_str)
- box_to_color_map[box] = STANDARD_COLORS[
- classes[i] % len(STANDARD_COLORS)]
- else:
- break # 网络输出概率已经排序过,当遇到一个不满足后面的肯定不满足
-
-
-def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
+def draw_text(draw,
+ box: list,
+ cls: int,
+ score: float,
+ category_index: dict,
+ color: str,
+ font: str = 'arial.ttf',
+ font_size: int = 24):
+ """
+    Draw the object bounding box and class information onto the image.
+ """
try:
- font = ImageFont.truetype('arial.ttf', 24)
+ font = ImageFont.truetype(font, font_size)
except IOError:
font = ImageFont.load_default()
+ left, top, right, bottom = box
# If the total height of the display strings added to the top of the bounding
# box exceeds the top of the image, stack the strings below the bounding box
# instead of above.
- display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]]
+ display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
+ display_str_heights = [font.getsize(ds)[1] for ds in display_str]
# Each display_str has a top and bottom margin of 0.05x.
- total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
+ display_str_height = (1 + 2 * 0.05) * max(display_str_heights)
- if top > total_display_str_height:
+ if top > display_str_height:
+ text_top = top - display_str_height
text_bottom = top
else:
- text_bottom = bottom + total_display_str_height
- # Reverse list and print from bottom to top.
- for display_str in box_to_display_str_map[box][::-1]:
- text_width, text_height = font.getsize(display_str)
- margin = np.ceil(0.05 * text_height)
- draw.rectangle([(left, text_bottom - text_height - 2 * margin),
- (left + text_width, text_bottom)], fill=color)
- draw.text((left + margin, text_bottom - text_height - margin),
- display_str,
+ text_top = bottom
+ text_bottom = bottom + display_str_height
+
+ for ds in display_str:
+ text_width, text_height = font.getsize(ds)
+ margin = np.ceil(0.05 * text_width)
+ draw.rectangle([(left, text_top),
+ (left + text_width + 2 * margin, text_bottom)], fill=color)
+ draw.text((left + margin, text_top),
+ ds,
fill='black',
font=font)
- text_bottom -= text_height - 2 * margin
+ left += text_width
+
+
+def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
+ np_image = np.array(image)
+ masks = np.where(masks > thresh, True, False)
+
+ # colors = np.array(colors)
+ img_to_draw = np.copy(np_image)
+ # TODO: There might be a way to vectorize this
+ for mask, color in zip(masks, colors):
+ img_to_draw[mask] = color
+
+ out = np_image * (1 - alpha) + img_to_draw * alpha
+ return fromarray(out.astype(np.uint8))
+
+
+def draw_objs(image: Image,
+ boxes: np.ndarray = None,
+ classes: np.ndarray = None,
+ scores: np.ndarray = None,
+ masks: np.ndarray = None,
+ category_index: dict = None,
+ box_thresh: float = 0.1,
+ mask_thresh: float = 0.5,
+ line_thickness: int = 8,
+ font: str = 'arial.ttf',
+ font_size: int = 24,
+ draw_boxes_on_image: bool = True,
+ draw_masks_on_image: bool = False):
+ """
+ 将目标边界框信息,类别信息,mask信息绘制在图片上
+ Args:
+ image: 需要绘制的图片
+ boxes: 目标边界框信息
+ classes: 目标类别信息
+ scores: 目标概率信息
+ masks: 目标mask信息
+ category_index: 类别与名称字典
+ box_thresh: 过滤的概率阈值
+ mask_thresh:
+ line_thickness: 边界框宽度
+ font: 字体类型
+ font_size: 字体大小
+ draw_boxes_on_image:
+ draw_masks_on_image:
+
+ Returns:
+
+ """
+
+    # filter out low-confidence objects
+ idxs = np.greater(scores, box_thresh)
+ boxes = boxes[idxs]
+ classes = classes[idxs]
+ scores = scores[idxs]
+ if masks is not None:
+ masks = masks[idxs]
+ if len(boxes) == 0:
+ return image
+ colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]
-def draw_box(image, boxes, classes, scores, category_index, thresh=0.5, line_thickness=8):
- box_to_display_str_map = collections.defaultdict(list)
- box_to_color_map = collections.defaultdict(str)
+ if draw_boxes_on_image:
+ # Draw all boxes onto image.
+ draw = ImageDraw.Draw(image)
+ for box, cls, score, color in zip(boxes, classes, scores, colors):
+ left, top, right, bottom = box
+            # draw the object bounding box
+ draw.line([(left, top), (left, bottom), (right, bottom),
+ (right, top), (left, top)], width=line_thickness, fill=color)
+            # draw the class label and confidence score
+ draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)
- filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)
+ if draw_masks_on_image and (masks is not None):
+ # Draw all mask onto image.
+ image = draw_masks(image, masks, colors, mask_thresh)
- # Draw all boxes onto image.
- draw = ImageDraw.Draw(image)
- im_width, im_height = image.size
- for box, color in box_to_color_map.items():
- xmin, ymin, xmax, ymax = box
- (left, right, top, bottom) = (xmin * 1, xmax * 1,
- ymin * 1, ymax * 1)
- draw.line([(left, top), (left, bottom), (right, bottom),
- (right, top), (left, top)], width=line_thickness, fill=color)
- draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
+ return image
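
A small usage sketch of the new `draw_objs` API with dummy predictions (not part of the patch; the `category_index` below is a hypothetical single-class mapping whose keys are strings of the class ids, matching how `predict.py` builds it):

```python
import numpy as np
from PIL import Image
from draw_box_utils import draw_objs

img = Image.new("RGB", (300, 300), (255, 255, 255))
boxes = np.array([[30., 30., 150., 200.]])  # xmin, ymin, xmax, ymax
classes = np.array([1])
scores = np.array([0.92])
category_index = {"1": "person"}            # hypothetical class-id -> name mapping

plot_img = draw_objs(img, boxes, classes, scores,
                     category_index=category_index,
                     box_thresh=0.5,
                     line_thickness=3,
                     font='arial.ttf',
                     font_size=20)
plot_img.save("demo_result.jpg")
```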
diff --git a/pytorch_object_detection/faster_rcnn/my_dataset.py b/pytorch_object_detection/faster_rcnn/my_dataset.py
index 23986bdf5..efabd862e 100644
--- a/pytorch_object_detection/faster_rcnn/my_dataset.py
+++ b/pytorch_object_detection/faster_rcnn/my_dataset.py
@@ -1,3 +1,4 @@
+import numpy as np
from torch.utils.data import Dataset
import os
import torch
@@ -11,7 +12,11 @@ class VOCDataSet(Dataset):
def __init__(self, voc_root, year="2012", transforms=None, txt_name: str = "train.txt"):
assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
- self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
+        # be tolerant: voc_root may already point inside VOCdevkit
+ if "VOCdevkit" in voc_root:
+ self.root = os.path.join(voc_root, f"VOC{year}")
+ else:
+ self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
self.img_root = os.path.join(self.root, "JPEGImages")
self.annotations_root = os.path.join(self.root, "Annotations")
@@ -20,20 +25,34 @@ def __init__(self, voc_root, year="2012", transforms=None, txt_name: str = "trai
assert os.path.exists(txt_path), "not found {} file.".format(txt_name)
with open(txt_path) as read:
- self.xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml")
- for line in read.readlines() if len(line.strip()) > 0]
+ xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml")
+ for line in read.readlines() if len(line.strip()) > 0]
+ self.xml_list = []
# check file
+ for xml_path in xml_list:
+ if os.path.exists(xml_path) is False:
+ print(f"Warning: not found '{xml_path}', skip this annotation file.")
+ continue
+
+ # check for targets
+ with open(xml_path) as fid:
+ xml_str = fid.read()
+ xml = etree.fromstring(xml_str)
+ data = self.parse_xml_to_dict(xml)["annotation"]
+ if "object" not in data:
+ print(f"INFO: no objects in {xml_path}, skip this annotation file.")
+ continue
+
+ self.xml_list.append(xml_path)
+
assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
- for xml_path in self.xml_list:
- assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)
# read class_indict
json_file = './pascal_voc_classes.json'
assert os.path.exists(json_file), "{} file not exist.".format(json_file)
- json_file = open(json_file, 'r')
- self.class_dict = json.load(json_file)
- json_file.close()
+ with open(json_file, 'r') as f:
+ self.class_dict = json.load(f)
self.transforms = transforms
@@ -181,7 +200,7 @@ def collate_fn(batch):
return tuple(zip(*batch))
# import transforms
-# from draw_box_utils import draw_box
+# from draw_box_utils import draw_objs
# from PIL import Image
# import json
# import matplotlib.pyplot as plt
@@ -193,7 +212,7 @@ def collate_fn(batch):
# try:
# json_file = open('./pascal_voc_classes.json', 'r')
# class_dict = json.load(json_file)
-# category_index = {v: k for k, v in class_dict.items()}
+# category_index = {str(v): str(k) for k, v in class_dict.items()}
# except Exception as e:
# print(e)
# exit(-1)
@@ -210,12 +229,14 @@ def collate_fn(batch):
# for index in random.sample(range(0, len(train_data_set)), k=5):
# img, target = train_data_set[index]
# img = ts.ToPILImage()(img)
-# draw_box(img,
-# target["boxes"].numpy(),
-# target["labels"].numpy(),
-# [1 for i in range(len(target["labels"].numpy()))],
-# category_index,
-# thresh=0.5,
-# line_thickness=5)
-# plt.imshow(img)
+# plot_img = draw_objs(img,
+# target["boxes"].numpy(),
+# target["labels"].numpy(),
+# np.ones(target["labels"].shape[0]),
+# category_index=category_index,
+# box_thresh=0.5,
+# line_thickness=3,
+# font='arial.ttf',
+# font_size=20)
+# plt.imshow(plot_img)
# plt.show()
diff --git a/pytorch_object_detection/faster_rcnn/network_files/boxes.py b/pytorch_object_detection/faster_rcnn/network_files/boxes.py
index f720df1f8..8eeca4573 100644
--- a/pytorch_object_detection/faster_rcnn/network_files/boxes.py
+++ b/pytorch_object_detection/faster_rcnn/network_files/boxes.py
@@ -23,7 +23,7 @@ def nms(boxes, scores, iou_threshold):
scores for each one of the boxes
iou_threshold : float
discards all overlapping
- boxes with IoU < iou_threshold
+ boxes with IoU > iou_threshold
Returns
-------
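
A quick sanity check of the corrected wording, using `torchvision.ops.nms`, which has the same semantics as this wrapper: boxes whose IoU with an already-kept, higher-scoring box exceeds `iou_threshold` are discarded.

```python
import torch
from torchvision.ops import nms

boxes = torch.tensor([[0., 0., 10., 10.],     # highest score, always kept
                      [1., 1., 11., 11.],     # IoU about 0.68 with the first box -> suppressed
                      [20., 20., 30., 30.]])  # no overlap -> kept
scores = torch.tensor([0.9, 0.8, 0.7])

keep = nms(boxes, scores, iou_threshold=0.5)
print(keep)  # tensor([0, 2])
```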
diff --git a/pytorch_object_detection/faster_rcnn/predict.py b/pytorch_object_detection/faster_rcnn/predict.py
index 35ad35dd2..2b85400be 100644
--- a/pytorch_object_detection/faster_rcnn/predict.py
+++ b/pytorch_object_detection/faster_rcnn/predict.py
@@ -10,7 +10,7 @@
from torchvision import transforms
from network_files import FasterRCNN, FastRCNNPredictor, AnchorsGenerator
from backbone import resnet50_fpn_backbone, MobileNetV2
-from draw_box_utils import draw_box
+from draw_box_utils import draw_objs
def create_model(num_classes):
@@ -52,18 +52,20 @@ def main():
model = create_model(num_classes=21)
# load train weights
- train_weights = "./save_weights/model.pth"
- assert os.path.exists(train_weights), "{} file dose not exist.".format(train_weights)
- model.load_state_dict(torch.load(train_weights, map_location=device)["model"])
+ weights_path = "./save_weights/model.pth"
+    assert os.path.exists(weights_path), "{} file does not exist.".format(weights_path)
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
model.to(device)
# read class_indict
label_json_path = './pascal_voc_classes.json'
assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
- json_file = open(label_json_path, 'r')
- class_dict = json.load(json_file)
- json_file.close()
- category_index = {v: k for k, v in class_dict.items()}
+ with open(label_json_path, 'r') as f:
+ class_dict = json.load(f)
+
+ category_index = {str(v): str(k) for k, v in class_dict.items()}
# load image
original_img = Image.open("./test.jpg")
@@ -93,19 +95,20 @@ def main():
if len(predict_boxes) == 0:
print("没有检测到任何目标!")
- draw_box(original_img,
- predict_boxes,
- predict_classes,
- predict_scores,
- category_index,
- thresh=0.5,
- line_thickness=3)
- plt.imshow(original_img)
+ plot_img = draw_objs(original_img,
+ predict_boxes,
+ predict_classes,
+ predict_scores,
+ category_index=category_index,
+ box_thresh=0.5,
+ line_thickness=3,
+ font='arial.ttf',
+ font_size=20)
+ plt.imshow(plot_img)
plt.show()
# 保存预测的图片结果
- original_img.save("test_result.jpg")
+ plot_img.save("test_result.jpg")
if __name__ == '__main__':
main()
-
diff --git a/pytorch_object_detection/faster_rcnn/train_mobilenetv2.py b/pytorch_object_detection/faster_rcnn/train_mobilenetv2.py
index eab4fd4ec..bcbd28f8e 100644
--- a/pytorch_object_detection/faster_rcnn/train_mobilenetv2.py
+++ b/pytorch_object_detection/faster_rcnn/train_mobilenetv2.py
@@ -146,7 +146,7 @@ def main():
# write into txt
with open(results_file, "a") as f:
# 写入的数据包括coco指标还有loss和learning rate
- result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
@@ -193,7 +193,7 @@ def main():
# write into txt
with open(results_file, "a") as f:
# 写入的数据包括coco指标还有loss和learning rate
- result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
diff --git a/pytorch_object_detection/faster_rcnn/train_multi_GPU.py b/pytorch_object_detection/faster_rcnn/train_multi_GPU.py
index 3315bc760..1ec76c076 100644
--- a/pytorch_object_detection/faster_rcnn/train_multi_GPU.py
+++ b/pytorch_object_detection/faster_rcnn/train_multi_GPU.py
@@ -96,6 +96,9 @@ def main(args):
model = create_model(num_classes=args.num_classes + 1)
model.to(device)
+ if args.distributed and args.sync_bn:
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
model_without_ddp = model
if args.distributed:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
@@ -154,7 +157,7 @@ def main(args):
# write into txt
with open(results_file, "a") as f:
# 写入的数据包括coco指标还有loss和learning rate
- result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
@@ -246,6 +249,7 @@ def main(args):
parser.add_argument('--world-size', default=4, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
+ parser.add_argument("--sync-bn", dest="sync_bn", help="Use sync batch norm", type=bool, default=False)
# 是否使用混合精度训练(需要GPU支持混合精度)
parser.add_argument("--amp", default=False, help="Use torch.cuda.amp for mixed precision training")
diff --git a/pytorch_object_detection/faster_rcnn/train_res50_fpn.py b/pytorch_object_detection/faster_rcnn/train_res50_fpn.py
index b45c4897e..f45e62901 100644
--- a/pytorch_object_detection/faster_rcnn/train_res50_fpn.py
+++ b/pytorch_object_detection/faster_rcnn/train_res50_fpn.py
@@ -11,22 +11,26 @@
from train_utils import train_eval_utils as utils
-def create_model(num_classes):
+def create_model(num_classes, load_pretrain_weights=True):
# 注意,这里的backbone默认使用的是FrozenBatchNorm2d,即不会去更新bn参数
# 目的是为了防止batch_size太小导致效果更差(如果显存很小,建议使用默认的FrozenBatchNorm2d)
# 如果GPU显存很大可以设置比较大的batch_size就可以将norm_layer设置为普通的BatchNorm2d
# trainable_layers包括['layer4', 'layer3', 'layer2', 'layer1', 'conv1'], 5代表全部训练
- backbone = resnet50_fpn_backbone(norm_layer=torch.nn.BatchNorm2d,
+ # resnet50 imagenet weights url: https://download.pytorch.org/models/resnet50-0676ba61.pth
+ backbone = resnet50_fpn_backbone(pretrain_path="./backbone/resnet50.pth",
+ norm_layer=torch.nn.BatchNorm2d,
trainable_layers=3)
# 训练自己数据集时不要修改这里的91,修改的是传入的num_classes参数
model = FasterRCNN(backbone=backbone, num_classes=91)
- # 载入预训练模型权重
- # https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
- weights_dict = torch.load("./backbone/fasterrcnn_resnet50_fpn_coco.pth", map_location='cpu')
- missing_keys, unexpected_keys = model.load_state_dict(weights_dict, strict=False)
- if len(missing_keys) != 0 or len(unexpected_keys) != 0:
- print("missing_keys: ", missing_keys)
- print("unexpected_keys: ", unexpected_keys)
+
+ if load_pretrain_weights:
+        # load pre-trained model weights
+ # https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
+ weights_dict = torch.load("./backbone/fasterrcnn_resnet50_fpn_coco.pth", map_location='cpu')
+ missing_keys, unexpected_keys = model.load_state_dict(weights_dict, strict=False)
+ if len(missing_keys) != 0 or len(unexpected_keys) != 0:
+ print("missing_keys: ", missing_keys)
+ print("unexpected_keys: ", unexpected_keys)
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
@@ -36,8 +40,8 @@ def create_model(num_classes):
return model
-def main(parser_data):
- device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
print("Using {} device training.".format(device.type))
# 用来保存coco_info的文件
@@ -49,7 +53,7 @@ def main(parser_data):
"val": transforms.Compose([transforms.ToTensor()])
}
- VOC_root = parser_data.data_path
+ VOC_root = args.data_path
# check voc root
if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
raise FileNotFoundError("VOCdevkit dose not in path:'{}'.".format(VOC_root))
@@ -69,7 +73,7 @@ def main(parser_data):
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
# 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch
- batch_size = parser_data.batch_size
+ batch_size = args.batch_size
nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
print('Using %g dataloader workers' % nw)
if train_sampler:
@@ -98,15 +102,17 @@ def main(parser_data):
collate_fn=val_dataset.collate_fn)
# create model num_classes equal background + 20 classes
- model = create_model(num_classes=parser_data.num_classes + 1)
+ model = create_model(num_classes=args.num_classes + 1)
# print(model)
model.to(device)
# define optimizer
params = [p for p in model.parameters() if p.requires_grad]
- optimizer = torch.optim.SGD(params, lr=0.005,
- momentum=0.9, weight_decay=0.0005)
+ optimizer = torch.optim.SGD(params,
+ lr=args.lr,
+ momentum=args.momentum,
+ weight_decay=args.weight_decay)
scaler = torch.cuda.amp.GradScaler() if args.amp else None
@@ -116,21 +122,21 @@ def main(parser_data):
gamma=0.33)
# 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练
- if parser_data.resume != "":
- checkpoint = torch.load(parser_data.resume, map_location='cpu')
+ if args.resume != "":
+ checkpoint = torch.load(args.resume, map_location='cpu')
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
- parser_data.start_epoch = checkpoint['epoch'] + 1
+ args.start_epoch = checkpoint['epoch'] + 1
if args.amp and "scaler" in checkpoint:
scaler.load_state_dict(checkpoint["scaler"])
- print("the training process from epoch{}...".format(parser_data.start_epoch))
+ print("the training process from epoch{}...".format(args.start_epoch))
train_loss = []
learning_rate = []
val_map = []
- for epoch in range(parser_data.start_epoch, parser_data.epochs):
+ for epoch in range(args.start_epoch, args.epochs):
# train for one epoch, printing every 10 iterations
mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
device=device, epoch=epoch,
@@ -148,7 +154,7 @@ def main(parser_data):
# write into txt
with open(results_file, "a") as f:
# 写入的数据包括coco指标还有loss和learning rate
- result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
@@ -196,6 +202,17 @@ def main(parser_data):
# 训练的总epoch数
parser.add_argument('--epochs', default=15, type=int, metavar='N',
help='number of total epochs to run')
+ # 学习率
+ parser.add_argument('--lr', default=0.01, type=float,
+ help='initial learning rate, 0.02 is the default value for training '
+ 'on 8 gpus and 2 images_per_gpu')
+ # SGD的momentum参数
+ parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+ help='momentum')
+ # SGD的weight_decay参数
+ parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+ metavar='W', help='weight decay (default: 1e-4)',
+ dest='weight_decay')
# 训练的batch size
parser.add_argument('--batch_size', default=8, type=int, metavar='N',
help='batch size when training.')
diff --git a/pytorch_object_detection/faster_rcnn/validation.py b/pytorch_object_detection/faster_rcnn/validation.py
index 95b3ba696..d353aed4e 100644
--- a/pytorch_object_detection/faster_rcnn/validation.py
+++ b/pytorch_object_detection/faster_rcnn/validation.py
@@ -100,9 +100,9 @@ def main(parser_data):
# read class_indict
label_json_path = './pascal_voc_classes.json'
assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
- json_file = open(label_json_path, 'r')
- class_dict = json.load(json_file)
- json_file.close()
+ with open(label_json_path, 'r') as f:
+ class_dict = json.load(f)
+
category_index = {v: k for k, v in class_dict.items()}
VOC_root = parser_data.data_path
@@ -130,9 +130,11 @@ def main(parser_data):
model = FasterRCNN(backbone=backbone, num_classes=parser_data.num_classes + 1)
# 载入你自己训练好的模型权重
- weights_path = parser_data.weights
+ weights_path = parser_data.weights_path
assert os.path.exists(weights_path), "not found {} file.".format(weights_path)
- model.load_state_dict(torch.load(weights_path, map_location=device)['model'])
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
# print(model)
model.to(device)
@@ -201,7 +203,7 @@ def main(parser_data):
parser.add_argument('--data-path', default='/data/', help='dataset root')
# 训练好的权重文件
- parser.add_argument('--weights', default='./save_weights/model.pth', type=str, help='training weights')
+ parser.add_argument('--weights-path', default='./save_weights/model.pth', type=str, help='training weights')
# batch size
parser.add_argument('--batch_size', default=1, type=int, metavar='N',
diff --git a/pytorch_object_detection/mask_rcnn/README.md b/pytorch_object_detection/mask_rcnn/README.md
new file mode 100644
index 000000000..77f014021
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/README.md
@@ -0,0 +1,153 @@
+# Mask R-CNN
+
+## This project is based on the source code of the official PyTorch torchvision detection references (the pycocotools usage differs slightly)
+* https://github.com/pytorch/vision/tree/master/references/detection
+
+## Environment:
+* Python3.6/3.7/3.8
+* PyTorch 1.10 or later
+* pycocotools (Linux: `pip install pycocotools`; Windows: `pip install pycocotools-windows`, no extra Visual Studio install needed)
+* Ubuntu or CentOS (Windows is not recommended)
+* Training on a GPU is strongly recommended
+* See `requirements.txt` for the detailed environment configuration
+
+## File structure:
+```
+  ├── backbone: feature extraction network
+  ├── network_files: Mask R-CNN network
+  ├── train_utils: training and validation utilities (including COCO evaluation)
+  ├── my_dataset_coco.py: custom dataset for reading the COCO2017 dataset
+  ├── my_dataset_voc.py: custom dataset for reading the Pascal VOC dataset
+  ├── train.py: training script for a single GPU/CPU
+  ├── train_multi_GPU.py: training script for multi-GPU users
+  ├── predict.py: simple prediction script that runs inference with trained weights
+  ├── validation.py: evaluates the COCO metrics of trained weights on validation/test data and generates record_mAP.txt
+  └── transforms.py: data preprocessing (random horizontal flip of images and bboxes, PIL image to Tensor)
+```
+
+## Pre-trained weights download links (place the files in the current folder after downloading):
+* Resnet50 pre-trained weights https://download.pytorch.org/models/resnet50-0676ba61.pth (note: rename the file after downloading;
+for example, train.py reads `resnet50.pth`, not `resnet50-0676ba61.pth`)
+* Mask R-CNN (Resnet50+FPN) pre-trained weights https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth (note:
+rename the file after downloading; for example, train.py reads `maskrcnn_resnet50_fpn_coco.pth`, not `maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth`)
+
+
+## Datasets: this project uses the COCO2017 and Pascal VOC2012 datasets
+### COCO2017 dataset
+* Official COCO website: https://cocodataset.org/
+* If you are not familiar with the dataset, see my blog post: https://blog.csdn.net/qq_37541097/article/details/113247318
+* Taking the coco2017 dataset as an example, download these three files:
+  * `2017 Train images [118K/18GB]`: all images used during training
+  * `2017 Val images [5K/1GB]`: all images used during validation
+  * `2017 Train/Val annotations [241MB]`: annotation json files for the training and validation sets
+* Extract everything into a `coco2017` folder to obtain the following structure:
+```
+├── coco2017: dataset root directory
+     ├── train2017: all training images (118287)
+     ├── val2017: all validation images (5000)
+     └── annotations: annotation files
+              ├── instances_train2017.json: training-set annotations for the detection/segmentation tasks
+              ├── instances_val2017.json: validation-set annotations for the detection/segmentation tasks
+              ├── captions_train2017.json: training-set annotations for image captioning
+              ├── captions_val2017.json: validation-set annotations for image captioning
+              ├── person_keypoints_train2017.json: training-set annotations for human keypoint detection
+              └── person_keypoints_val2017.json: validation-set annotations for human keypoint detection
+```
+
+### Pascal VOC2012 dataset
+* Download link: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html#devkit
+* If you are not familiar with the dataset, see my blog post: https://blog.csdn.net/qq_37541097/article/details/115787033
+* The folder structure after extraction looks like this:
+```
+VOCdevkit
+    └── VOC2012
+         ├── Annotations               all image annotation files (XML)
+         ├── ImageSets
+         │   ├── Action                image info for human actions
+         │   ├── Layout                image info for human body parts
+         │   │
+         │   ├── Main                  image info for detection/classification
+         │   │   ├── train.txt         training set (5717)
+         │   │   ├── val.txt           validation set (5823)
+         │   │   └── trainval.txt      training + validation set (11540)
+         │   │
+         │   └── Segmentation          image info for segmentation
+         │       ├── train.txt         training set (1464)
+         │       ├── val.txt           validation set (1449)
+         │       └── trainval.txt      training + validation set (2913)
+         │
+         ├── JPEGImages                all image files
+         ├── SegmentationClass         semantic segmentation PNG images (by class)
+         └── SegmentationObject        instance segmentation PNG images (by object)
+```
+
+## Training
+* Make sure the dataset is prepared in advance
+* Make sure the corresponding pre-trained weights are downloaded in advance
+* Make sure `--num-classes` and `--data-path` are set correctly
+* To train on a single GPU, use the train.py script directly
+* To train on multiple GPUs, use the `torchrun --nproc_per_node=8 train_multi_GPU.py` command, where `nproc_per_node` is the number of GPUs to use
+* To restrict which GPU devices are used, prefix the command with `CUDA_VISIBLE_DEVICES=0,3` (for example, to use only the 1st and 4th GPU)
+* `CUDA_VISIBLE_DEVICES=0,3 torchrun --nproc_per_node=2 train_multi_GPU.py`
+
+## Notes
+1. When using the training scripts, set `--data-path` to the **root directory** where your dataset is stored:
+```
+# To use the COCO dataset, enable the custom CocoDetection dataset reader and extract the dataset to /data/coco2017
+python train.py --data-path /data/coco2017
+
+# To use the Pascal VOC dataset, enable the custom VOCInstances dataset reader and extract the dataset to /data/VOCdevkit
+python train.py --data-path /data/VOCdevkit
+```
+
+2. If you double the `batch_size`, it is recommended to double the learning rate as well. For example, when increasing `batch_size` from 4 to 8, raise `lr` from 0.004 to 0.008
+3. When using Batch Normalization, `batch_size` should not be smaller than 4, otherwise results get worse. **If GPU memory is limited and batch_size must be smaller than 4**, it is recommended to set
+`norm_layer` to `FrozenBatchNorm2d` or set `trainable_layers` to 0 (i.e. freeze the whole `backbone`) when creating `resnet50_fpn_backbone`; see the sketch after this list
+4. The `det_results.txt` (object detection) and `seg_results.txt` (instance segmentation) files saved during training contain the COCO metrics on the validation set for each epoch; the first 12 values are the COCO metrics, followed by the mean training loss and the learning rate
+5. When using the prediction script, set `weights_path` to the path of your own trained weights.
+6. When using the validation script, make sure your validation or test set contains objects of every class, and adjust `--num-classes`, `--data-path`, `--weights-path` and
+`--label-json-path` (this parameter depends on the dataset used for training). Try not to modify any other code
+
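A minimal sketch for note 3, assuming the `resnet50_fpn_backbone` in this project keeps the same `pretrain_path` / `norm_layer` / `trainable_layers` arguments as the Faster R-CNN version shown earlier in this patch:

```python
from torchvision.ops.misc import FrozenBatchNorm2d
from backbone import resnet50_fpn_backbone

# small batch size: keep BN statistics frozen and fine-tune none of the backbone stages
backbone = resnet50_fpn_backbone(pretrain_path="./resnet50.pth",
                                 norm_layer=FrozenBatchNorm2d,
                                 trainable_layers=0)
```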
+
+## Reproduced results
+Reproduced on the COCO2017 dataset, loading only the Resnet50 pre-trained weights and training for 26 epochs. The training command was:
+```
+torchrun --nproc_per_node=8 train_multi_GPU.py --batch-size 8 --lr 0.08 --pretrain False --amp True
+```
+
+Download link for the trained weights: https://pan.baidu.com/s/1qpXUIsvnj8RHY-V05J-mnA  password: 63d5
+
+mAP on the COCO2017 validation set (object detection task):
+```
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.381
+ Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.588
+ Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.411
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.215
+ Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.420
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.492
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.315
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.499
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.523
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.319
+ Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.565
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.666
+```
+
+mAP on the COCO2017 validation set (instance segmentation task):
+```
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.340
+ Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.552
+ Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.361
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.151
+ Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.369
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.500
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.290
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.449
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.468
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.266
+ Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.509
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.619
+```
+
+## If you are not familiar with how Mask R-CNN works, you can check out my bilibili video
+https://www.bilibili.com/video/BV1ZY411774T
diff --git a/pytorch_object_detection/mask_rcnn/backbone/__init__.py b/pytorch_object_detection/mask_rcnn/backbone/__init__.py
new file mode 100644
index 000000000..314eb748f
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/backbone/__init__.py
@@ -0,0 +1 @@
+from .resnet50_fpn_model import resnet50_fpn_backbone
diff --git a/pytorch_object_detection/mask_rcnn/backbone/feature_pyramid_network.py b/pytorch_object_detection/mask_rcnn/backbone/feature_pyramid_network.py
new file mode 100644
index 000000000..fc2fc757f
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/backbone/feature_pyramid_network.py
@@ -0,0 +1,235 @@
+from collections import OrderedDict
+
+import torch.nn as nn
+import torch
+from torch import Tensor
+import torch.nn.functional as F
+
+from torch.jit.annotations import Tuple, List, Dict
+
+
+class IntermediateLayerGetter(nn.ModuleDict):
+ """
+ Module wrapper that returns intermediate layers from a model
+ It has a strong assumption that the modules have been registered
+ into the model in the same order as they are used.
+ This means that one should **not** reuse the same nn.Module
+ twice in the forward if you want this to work.
+ Additionally, it is only able to query submodules that are directly
+ assigned to the model. So if `model` is passed, `model.feature1` can
+ be returned, but not `model.feature1.layer2`.
+ Arguments:
+ model (nn.Module): model on which we will extract the features
+ return_layers (Dict[name, new_name]): a dict containing the names
+ of the modules for which the activations will be returned as
+ the key of the dict, and the value of the dict is the name
+ of the returned activation (which the user can specify).
+ """
+ __annotations__ = {
+ "return_layers": Dict[str, str],
+ }
+
+ def __init__(self, model, return_layers):
+ if not set(return_layers).issubset([name for name, _ in model.named_children()]):
+ raise ValueError("return_layers are not present in model")
+
+ orig_return_layers = return_layers
+ return_layers = {str(k): str(v) for k, v in return_layers.items()}
+ layers = OrderedDict()
+
+        # iterate over the model's children in order and store them in an OrderedDict
+        # keep only the layers up to and including layer4; the unused layers after it are dropped
+ for name, module in model.named_children():
+ layers[name] = module
+ if name in return_layers:
+ del return_layers[name]
+ if not return_layers:
+ break
+
+ super().__init__(layers)
+ self.return_layers = orig_return_layers
+
+ def forward(self, x):
+ out = OrderedDict()
+        # run the input through each child module in turn,
+        # collecting the outputs of layer1, layer2, layer3 and layer4
+ for name, module in self.items():
+ x = module(x)
+ if name in self.return_layers:
+ out_name = self.return_layers[name]
+ out[out_name] = x
+ return out
+
+
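
A minimal sketch of what this wrapper does, using a torchvision resnet50 (a sketch, not part of the patch; it assumes the script is run from the mask_rcnn directory so `backbone.feature_pyramid_network` is importable; the channel counts are the standard resnet50 values):

```python
import torch
import torchvision

from backbone.feature_pyramid_network import IntermediateLayerGetter

model = torchvision.models.resnet50(pretrained=False)
getter = IntermediateLayerGetter(model, return_layers={"layer1": "0", "layer2": "1",
                                                       "layer3": "2", "layer4": "3"})
outs = getter(torch.randn(1, 3, 224, 224))
for name, feat in outs.items():
    print(name, feat.shape)  # channels 256 / 512 / 1024 / 2048, strides 4 / 8 / 16 / 32
```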
+class BackboneWithFPN(nn.Module):
+ """
+ Adds a FPN on top of a model.
+ Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
+ extract a submodel that returns the feature maps specified in return_layers.
+    The same limitations of IntermediateLayerGetter apply here.
+ Arguments:
+ backbone (nn.Module)
+ return_layers (Dict[name, new_name]): a dict containing the names
+ of the modules for which the activations will be returned as
+ the key of the dict, and the value of the dict is the name
+ of the returned activation (which the user can specify).
+ in_channels_list (List[int]): number of channels for each feature map
+ that is returned, in the order they are present in the OrderedDict
+ out_channels (int): number of channels in the FPN.
+ extra_blocks: ExtraFPNBlock
+ Attributes:
+ out_channels (int): the number of channels in the FPN
+ """
+
+ def __init__(self,
+ backbone: nn.Module,
+ return_layers=None,
+ in_channels_list=None,
+ out_channels=256,
+ extra_blocks=None,
+ re_getter=True):
+ super().__init__()
+
+ if extra_blocks is None:
+ extra_blocks = LastLevelMaxPool()
+
+ if re_getter:
+ assert return_layers is not None
+ self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+ else:
+ self.body = backbone
+
+ self.fpn = FeaturePyramidNetwork(
+ in_channels_list=in_channels_list,
+ out_channels=out_channels,
+ extra_blocks=extra_blocks,
+ )
+
+ self.out_channels = out_channels
+
+ def forward(self, x):
+ x = self.body(x)
+ x = self.fpn(x)
+ return x
+
+
+class FeaturePyramidNetwork(nn.Module):
+ """
+    Module that adds an FPN on top of a set of feature maps. This is based on
+    `"Feature Pyramid Networks for Object Detection" <https://arxiv.org/abs/1612.03144>`_.
+ The feature maps are currently supposed to be in increasing depth
+ order.
+ The input to the model is expected to be an OrderedDict[Tensor], containing
+ the feature maps on top of which the FPN will be added.
+ Arguments:
+ in_channels_list (list[int]): number of channels for each feature map that
+ is passed to the module
+ out_channels (int): number of channels of the FPN representation
+ extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
+ be performed. It is expected to take the fpn features, the original
+ features and the names of the original features as input, and returns
+ a new list of feature maps and their corresponding names
+ """
+
+ def __init__(self, in_channels_list, out_channels, extra_blocks=None):
+ super().__init__()
+        # 1x1 convs that adjust the channel number of the resnet feature maps (layer1-4)
+ self.inner_blocks = nn.ModuleList()
+        # 3x3 convs applied to the adjusted feature maps to produce the prediction feature maps
+ self.layer_blocks = nn.ModuleList()
+ for in_channels in in_channels_list:
+ if in_channels == 0:
+ continue
+ inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
+ layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
+ self.inner_blocks.append(inner_block_module)
+ self.layer_blocks.append(layer_block_module)
+
+ # initialize parameters now to avoid modifying the initialization of top_blocks
+ for m in self.children():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_uniform_(m.weight, a=1)
+ nn.init.constant_(m.bias, 0)
+
+ self.extra_blocks = extra_blocks
+
+ def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
+ """
+ This is equivalent to self.inner_blocks[idx](x),
+ but torchscript doesn't support this yet
+ """
+ num_blocks = len(self.inner_blocks)
+ if idx < 0:
+ idx += num_blocks
+ i = 0
+ out = x
+ for module in self.inner_blocks:
+ if i == idx:
+ out = module(x)
+ i += 1
+ return out
+
+ def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
+ """
+ This is equivalent to self.layer_blocks[idx](x),
+ but torchscript doesn't support this yet
+ """
+ num_blocks = len(self.layer_blocks)
+ if idx < 0:
+ idx += num_blocks
+ i = 0
+ out = x
+ for module in self.layer_blocks:
+ if i == idx:
+ out = module(x)
+ i += 1
+ return out
+
+ def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
+ """
+ Computes the FPN for a set of feature maps.
+ Arguments:
+ x (OrderedDict[Tensor]): feature maps for each feature level.
+ Returns:
+ results (OrderedDict[Tensor]): feature maps after FPN layers.
+ They are ordered from highest resolution first.
+ """
+ # unpack OrderedDict into two lists for easier handling
+ names = list(x.keys())
+ x = list(x.values())
+
+        # adjust the channels of the resnet layer4 feature map to out_channels
+ # last_inner = self.inner_blocks[-1](x[-1])
+ last_inner = self.get_result_from_inner_blocks(x[-1], -1)
+        # results stores every output feature map
+ results = []
+        # pass the channel-adjusted layer4 feature map through the 3x3 conv to get its output feature map
+ # results.append(self.layer_blocks[-1](last_inner))
+ results.append(self.get_result_from_layer_blocks(last_inner, -1))
+
+ for idx in range(len(x) - 2, -1, -1):
+ inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
+ feat_shape = inner_lateral.shape[-2:]
+ inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
+ last_inner = inner_lateral + inner_top_down
+ results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))
+
+        # generate the extra output feature map (level 5) on top of the layer4 output
+ if self.extra_blocks is not None:
+ results, names = self.extra_blocks(results, x, names)
+
+ # make it back an OrderedDict
+ out = OrderedDict([(k, v) for k, v in zip(names, results)])
+
+ return out
+
+
+class LastLevelMaxPool(torch.nn.Module):
+ """
+ Applies a max_pool2d on top of the last feature map
+ """
+
+ def forward(self, x: List[Tensor], y: List[Tensor], names: List[str]) -> Tuple[List[Tensor], List[str]]:
+ names.append("pool")
+ x.append(F.max_pool2d(x[-1], 1, 2, 0))
+ return x, names
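+
+
+# Illustrative smoke test, not part of the original file: it builds a standalone
+# FeaturePyramidNetwork and feeds it random feature maps. The channel counts and
+# spatial sizes below are assumptions chosen to mimic a resnet50 backbone, and it
+# relies on the module-level imports (torch, OrderedDict) already used above.
+if __name__ == '__main__':
+    fpn = FeaturePyramidNetwork(in_channels_list=[256, 512, 1024, 2048],
+                                out_channels=256,
+                                extra_blocks=LastLevelMaxPool())
+    feats = OrderedDict()
+    feats["0"] = torch.randn(1, 256, 64, 64)
+    feats["1"] = torch.randn(1, 512, 32, 32)
+    feats["2"] = torch.randn(1, 1024, 16, 16)
+    feats["3"] = torch.randn(1, 2048, 8, 8)
+    outputs = fpn(feats)
+    # expected: five 256-channel maps under keys "0"-"3" plus "pool"
+    print([(k, tuple(v.shape)) for k, v in outputs.items()])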
diff --git a/pytorch_object_detection/mask_rcnn/backbone/resnet50_fpn_model.py b/pytorch_object_detection/mask_rcnn/backbone/resnet50_fpn_model.py
new file mode 100644
index 000000000..a79502e5b
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/backbone/resnet50_fpn_model.py
@@ -0,0 +1,199 @@
+import os
+
+import torch
+import torch.nn as nn
+from torchvision.ops.misc import FrozenBatchNorm2d
+
+from .feature_pyramid_network import BackboneWithFPN, LastLevelMaxPool
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None):
+ super().__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+
+ self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
+ kernel_size=1, stride=1, bias=False) # squeeze channels
+ self.bn1 = norm_layer(out_channel)
+ # -----------------------------------------
+ self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
+ kernel_size=3, stride=stride, bias=False, padding=1)
+ self.bn2 = norm_layer(out_channel)
+ # -----------------------------------------
+ self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel * self.expansion,
+ kernel_size=1, stride=1, bias=False) # unsqueeze channels
+ self.bn3 = norm_layer(out_channel * self.expansion)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+
+ def forward(self, x):
+ identity = x
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(nn.Module):
+
+ def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None):
+ super().__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+ self._norm_layer = norm_layer
+
+ self.include_top = include_top
+ self.in_channel = 64
+
+ self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
+ padding=3, bias=False)
+ self.bn1 = norm_layer(self.in_channel)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ self.layer1 = self._make_layer(block, 64, blocks_num[0])
+ self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
+ self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
+ self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
+ if self.include_top:
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) # output size = (1, 1)
+ self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+
+ def _make_layer(self, block, channel, block_num, stride=1):
+ norm_layer = self._norm_layer
+ downsample = None
+ if stride != 1 or self.in_channel != channel * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
+ norm_layer(channel * block.expansion))
+
+ layers = []
+ layers.append(block(self.in_channel, channel, downsample=downsample,
+ stride=stride, norm_layer=norm_layer))
+ self.in_channel = channel * block.expansion
+
+ for _ in range(1, block_num):
+ layers.append(block(self.in_channel, channel, norm_layer=norm_layer))
+
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+
+ x = self.layer1(x)
+ x = self.layer2(x)
+ x = self.layer3(x)
+ x = self.layer4(x)
+
+ if self.include_top:
+ x = self.avgpool(x)
+ x = torch.flatten(x, 1)
+ x = self.fc(x)
+
+ return x
+
+
+def overwrite_eps(model, eps):
+ """
+ This method overwrites the default eps values of all the
+ FrozenBatchNorm2d layers of the model with the provided value.
+ This is necessary to address the BC-breaking change introduced
+ by the bug-fix at pytorch/vision#2933. The overwrite is applied
+ only when the pretrained weights are loaded to maintain compatibility
+ with previous versions.
+
+ Args:
+ model (nn.Module): The model on which we perform the overwrite.
+ eps (float): The new value of eps.
+ """
+ for module in model.modules():
+ if isinstance(module, FrozenBatchNorm2d):
+ module.eps = eps
+
+
+def resnet50_fpn_backbone(pretrain_path="",
+ norm_layer=nn.BatchNorm2d,
+ trainable_layers=3,
+ returned_layers=None,
+ extra_blocks=None):
+ """
+    Build a resnet50 + FPN backbone.
+    Args:
+        pretrain_path: path to pretrained resnet50 weights; leave empty to skip loading
+        norm_layer: defaults to nn.BatchNorm2d. If GPU memory is limited and batch_size
+                    cannot be made large, consider setting norm_layer to FrozenBatchNorm2d
+                    (https://github.com/facebookresearch/maskrcnn-benchmark/issues/267)
+        trainable_layers: which layers to keep trainable (not frozen)
+        returned_layers: which layers' outputs should be returned
+        extra_blocks: extra blocks appended on top of the returned feature maps
+
+ Returns:
+
+ """
+ resnet_backbone = ResNet(Bottleneck, [3, 4, 6, 3],
+ include_top=False,
+ norm_layer=norm_layer)
+
+    # norm_layer is passed as a class, not an instance, so compare the class itself
+    if norm_layer is FrozenBatchNorm2d:
+ overwrite_eps(resnet_backbone, 0.0)
+
+ if pretrain_path != "":
+ assert os.path.exists(pretrain_path), "{} is not exist.".format(pretrain_path)
+ # 载入预训练权重
+ print(resnet_backbone.load_state_dict(torch.load(pretrain_path), strict=False))
+
+    # select layers that won't be frozen
+ assert 0 <= trainable_layers <= 5
+ layers_to_train = ['layer4', 'layer3', 'layer2', 'layer1', 'conv1'][:trainable_layers]
+
+    # when training every layer, don't forget bn1, which follows conv1
+ if trainable_layers == 5:
+ layers_to_train.append("bn1")
+
+ # freeze layers
+ for name, parameter in resnet_backbone.named_parameters():
+        # freeze the parameters of layers that are not in layers_to_train
+ if all([not name.startswith(layer) for layer in layers_to_train]):
+ parameter.requires_grad_(False)
+
+ if extra_blocks is None:
+ extra_blocks = LastLevelMaxPool()
+
+ if returned_layers is None:
+ returned_layers = [1, 2, 3, 4]
+    # the returned layer indices must be greater than 0 and less than 5
+ assert min(returned_layers) > 0 and max(returned_layers) < 5
+
+ # return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'}
+ return_layers = {f'layer{k}': str(v) for v, k in enumerate(returned_layers)}
+
+    # in_channel is the channel count of layer4's output feature map = 2048
+    in_channels_stage2 = resnet_backbone.in_channel // 8  # 256
+    # channels of every feature map that resnet50 feeds into the fpn
+    in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
+    # channels of every feature map produced by the fpn
+ out_channels = 256
+ return BackboneWithFPN(resnet_backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)
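+
+
+# Illustrative usage sketch, not part of the original file: it builds the backbone
+# without pretrained weights and prints the shapes of the FPN feature maps. The
+# input size is an arbitrary assumption; running this requires the package context
+# so that the relative import at the top of the file resolves.
+if __name__ == '__main__':
+    model = resnet50_fpn_backbone(pretrain_path="", trainable_layers=3)
+    dummy = torch.randn(1, 3, 224, 224)
+    with torch.no_grad():
+        feature_maps = model(dummy)
+    # expected keys: '0'-'3' plus 'pool', each feature map with 256 channels
+    for name, feature in feature_maps.items():
+        print(name, tuple(feature.shape))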
diff --git a/pytorch_object_detection/mask_rcnn/coco91_indices.json b/pytorch_object_detection/mask_rcnn/coco91_indices.json
new file mode 100644
index 000000000..decbe58ce
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/coco91_indices.json
@@ -0,0 +1,92 @@
+{
+ "1": "person",
+ "2": "bicycle",
+ "3": "car",
+ "4": "motorcycle",
+ "5": "airplane",
+ "6": "bus",
+ "7": "train",
+ "8": "truck",
+ "9": "boat",
+ "10": "traffic light",
+ "11": "fire hydrant",
+ "12": "N/A",
+ "13": "stop sign",
+ "14": "parking meter",
+ "15": "bench",
+ "16": "bird",
+ "17": "cat",
+ "18": "dog",
+ "19": "horse",
+ "20": "sheep",
+ "21": "cow",
+ "22": "elephant",
+ "23": "bear",
+ "24": "zebra",
+ "25": "giraffe",
+ "26": "N/A",
+ "27": "backpack",
+ "28": "umbrella",
+ "29": "N/A",
+ "30": "N/A",
+ "31": "handbag",
+ "32": "tie",
+ "33": "suitcase",
+ "34": "frisbee",
+ "35": "skis",
+ "36": "snowboard",
+ "37": "sports ball",
+ "38": "kite",
+ "39": "baseball bat",
+ "40": "baseball glove",
+ "41": "skateboard",
+ "42": "surfboard",
+ "43": "tennis racket",
+ "44": "bottle",
+ "45": "N/A",
+ "46": "wine glass",
+ "47": "cup",
+ "48": "fork",
+ "49": "knife",
+ "50": "spoon",
+ "51": "bowl",
+ "52": "banana",
+ "53": "apple",
+ "54": "sandwich",
+ "55": "orange",
+ "56": "broccoli",
+ "57": "carrot",
+ "58": "hot dog",
+ "59": "pizza",
+ "60": "donut",
+ "61": "cake",
+ "62": "chair",
+ "63": "couch",
+ "64": "potted plant",
+ "65": "bed",
+ "66": "N/A",
+ "67": "dining table",
+ "68": "N/A",
+ "69": "N/A",
+ "70": "toilet",
+ "71": "N/A",
+ "72": "tv",
+ "73": "laptop",
+ "74": "mouse",
+ "75": "remote",
+ "76": "keyboard",
+ "77": "cell phone",
+ "78": "microwave",
+ "79": "oven",
+ "80": "toaster",
+ "81": "sink",
+ "82": "refrigerator",
+ "83": "N/A",
+ "84": "book",
+ "85": "clock",
+ "86": "vase",
+ "87": "scissors",
+ "88": "teddy bear",
+ "89": "hair drier",
+ "90": "toothbrush"
+}
\ No newline at end of file
diff --git a/pytorch_object_detection/mask_rcnn/det_results20220406-141544.txt b/pytorch_object_detection/mask_rcnn/det_results20220406-141544.txt
new file mode 100644
index 000000000..28014527b
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/det_results20220406-141544.txt
@@ -0,0 +1,26 @@
+epoch:0 0.171 0.342 0.154 0.099 0.211 0.213 0.184 0.315 0.334 0.168 0.375 0.440 1.3826 0.08
+epoch:1 0.230 0.419 0.230 0.132 0.266 0.288 0.224 0.374 0.395 0.216 0.435 0.512 1.0356 0.08
+epoch:2 0.242 0.435 0.244 0.133 0.272 0.313 0.233 0.393 0.416 0.232 0.452 0.532 0.9718 0.08
+epoch:3 0.261 0.456 0.269 0.145 0.284 0.326 0.248 0.415 0.440 0.260 0.475 0.550 0.9363 0.08
+epoch:4 0.266 0.458 0.277 0.150 0.301 0.337 0.250 0.409 0.433 0.245 0.467 0.564 0.9145 0.08
+epoch:5 0.272 0.465 0.286 0.155 0.309 0.348 0.251 0.407 0.429 0.247 0.461 0.561 0.8982 0.08
+epoch:6 0.288 0.482 0.303 0.163 0.321 0.363 0.263 0.431 0.452 0.265 0.491 0.570 0.8859 0.08
+epoch:7 0.287 0.483 0.302 0.164 0.320 0.363 0.268 0.432 0.454 0.268 0.483 0.584 0.8771 0.08
+epoch:8 0.298 0.492 0.318 0.166 0.336 0.377 0.268 0.434 0.454 0.265 0.500 0.580 0.8685 0.08
+epoch:9 0.289 0.484 0.306 0.156 0.325 0.374 0.263 0.428 0.450 0.252 0.490 0.589 0.8612 0.08
+epoch:10 0.297 0.489 0.316 0.167 0.330 0.381 0.270 0.436 0.459 0.258 0.501 0.579 0.8547 0.08
+epoch:11 0.299 0.494 0.317 0.171 0.335 0.382 0.272 0.439 0.461 0.276 0.501 0.586 0.8498 0.08
+epoch:12 0.301 0.497 0.321 0.178 0.333 0.390 0.270 0.443 0.466 0.277 0.505 0.600 0.8461 0.08
+epoch:13 0.307 0.503 0.327 0.175 0.345 0.388 0.276 0.441 0.465 0.269 0.510 0.574 0.8409 0.08
+epoch:14 0.299 0.491 0.319 0.171 0.339 0.372 0.271 0.445 0.470 0.284 0.508 0.593 0.8355 0.08
+epoch:15 0.306 0.503 0.324 0.166 0.342 0.396 0.278 0.443 0.468 0.271 0.511 0.598 0.8330 0.08
+epoch:16 0.374 0.579 0.407 0.214 0.415 0.476 0.311 0.500 0.526 0.325 0.573 0.659 0.7421 0.008
+epoch:17 0.379 0.587 0.409 0.214 0.420 0.484 0.316 0.502 0.528 0.322 0.569 0.668 0.7157 0.008
+epoch:18 0.380 0.587 0.411 0.214 0.423 0.486 0.315 0.503 0.528 0.323 0.571 0.669 0.7016 0.008
+epoch:19 0.381 0.588 0.413 0.216 0.422 0.490 0.317 0.508 0.532 0.332 0.574 0.676 0.6897 0.008
+epoch:20 0.379 0.586 0.410 0.212 0.418 0.488 0.313 0.499 0.523 0.317 0.566 0.667 0.6802 0.008
+epoch:21 0.378 0.587 0.408 0.210 0.418 0.488 0.314 0.496 0.520 0.314 0.560 0.667 0.6708 0.008
+epoch:22 0.381 0.588 0.411 0.213 0.420 0.495 0.316 0.500 0.524 0.318 0.567 0.673 0.6497 0.0008
+epoch:23 0.381 0.588 0.411 0.215 0.420 0.492 0.315 0.499 0.523 0.319 0.565 0.666 0.6447 0.0008
+epoch:24 0.381 0.588 0.412 0.214 0.419 0.495 0.316 0.499 0.523 0.317 0.565 0.669 0.6421 0.0008
+epoch:25 0.380 0.585 0.411 0.214 0.419 0.494 0.314 0.498 0.522 0.316 0.566 0.664 0.6398 0.0008
diff --git a/pytorch_object_detection/mask_rcnn/draw_box_utils.py b/pytorch_object_detection/mask_rcnn/draw_box_utils.py
new file mode 100644
index 000000000..2d74c9529
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/draw_box_utils.py
@@ -0,0 +1,153 @@
+from PIL.Image import Image, fromarray
+import PIL.ImageDraw as ImageDraw
+import PIL.ImageFont as ImageFont
+from PIL import ImageColor
+import numpy as np
+
+STANDARD_COLORS = [
+ 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
+ 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
+ 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
+ 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
+ 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
+ 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
+ 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
+ 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
+ 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
+ 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
+ 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
+ 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
+ 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
+ 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
+ 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
+ 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
+ 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
+ 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
+ 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
+ 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
+ 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
+ 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
+ 'WhiteSmoke', 'Yellow', 'YellowGreen'
+]
+
+
+def draw_text(draw,
+ box: list,
+ cls: int,
+ score: float,
+ category_index: dict,
+ color: str,
+ font: str = 'arial.ttf',
+ font_size: int = 24):
+ """
+    Draw the bounding box and class information onto the image.
+ """
+ try:
+ font = ImageFont.truetype(font, font_size)
+ except IOError:
+ font = ImageFont.load_default()
+
+ left, top, right, bottom = box
+ # If the total height of the display strings added to the top of the bounding
+ # box exceeds the top of the image, stack the strings below the bounding box
+ # instead of above.
+ display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
+ display_str_heights = [font.getsize(ds)[1] for ds in display_str]
+ # Each display_str has a top and bottom margin of 0.05x.
+ display_str_height = (1 + 2 * 0.05) * max(display_str_heights)
+
+ if top > display_str_height:
+ text_top = top - display_str_height
+ text_bottom = top
+ else:
+ text_top = bottom
+ text_bottom = bottom + display_str_height
+
+    for ds in display_strs:
+ text_width, text_height = font.getsize(ds)
+ margin = np.ceil(0.05 * text_width)
+ draw.rectangle([(left, text_top),
+ (left + text_width + 2 * margin, text_bottom)], fill=color)
+ draw.text((left + margin, text_top),
+ ds,
+ fill='black',
+ font=font)
+ left += text_width
+
+
+def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
+ np_image = np.array(image)
+ masks = np.where(masks > thresh, True, False)
+
+ # colors = np.array(colors)
+ img_to_draw = np.copy(np_image)
+ # TODO: There might be a way to vectorize this
+ for mask, color in zip(masks, colors):
+ img_to_draw[mask] = color
+
+ out = np_image * (1 - alpha) + img_to_draw * alpha
+ return fromarray(out.astype(np.uint8))
+
+
+def draw_objs(image: Image,
+ boxes: np.ndarray = None,
+ classes: np.ndarray = None,
+ scores: np.ndarray = None,
+ masks: np.ndarray = None,
+ category_index: dict = None,
+ box_thresh: float = 0.1,
+ mask_thresh: float = 0.5,
+ line_thickness: int = 8,
+ font: str = 'arial.ttf',
+ font_size: int = 24,
+ draw_boxes_on_image: bool = True,
+ draw_masks_on_image: bool = True):
+ """
+    Draw bounding boxes, class labels and masks onto the image.
+    Args:
+        image: image to draw on
+        boxes: bounding boxes of the detected objects
+        classes: class indices of the detected objects
+        scores: detection scores
+        masks: instance masks
+        category_index: dict mapping class indices to class names
+        box_thresh: score threshold used to filter detections
+        mask_thresh:
+        line_thickness: bounding-box line width
+        font: font type
+        font_size: font size
+ draw_boxes_on_image:
+ draw_masks_on_image:
+
+ Returns:
+
+ """
+
+    # filter out low-score detections
+ idxs = np.greater(scores, box_thresh)
+ boxes = boxes[idxs]
+ classes = classes[idxs]
+ scores = scores[idxs]
+ if masks is not None:
+ masks = masks[idxs]
+ if len(boxes) == 0:
+ return image
+
+ colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]
+
+ if draw_boxes_on_image:
+ # Draw all boxes onto image.
+ draw = ImageDraw.Draw(image)
+ for box, cls, score, color in zip(boxes, classes, scores, colors):
+ left, top, right, bottom = box
+            # draw the bounding box
+ draw.line([(left, top), (left, bottom), (right, bottom),
+ (right, top), (left, top)], width=line_thickness, fill=color)
+            # draw the class label and score
+ draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)
+
+ if draw_masks_on_image and (masks is not None):
+ # Draw all mask onto image.
+ image = draw_masks(image, masks, colors, mask_thresh)
+
+ return image
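+
+
+# Illustrative sketch, not part of the original file: it overlays a single dummy
+# mask on a blank canvas with draw_masks. The canvas size, mask, color and output
+# filename are arbitrary assumptions.
+if __name__ == '__main__':
+    import PIL.Image as PILImage
+
+    canvas = PILImage.new("RGB", (64, 64), (255, 255, 255))
+    dummy_mask = np.zeros((1, 64, 64), dtype=np.float32)
+    dummy_mask[0, 16:48, 16:48] = 1.0  # a 32x32 square in the center
+    blended = draw_masks(canvas, dummy_mask, colors=[(255, 0, 0)], thresh=0.5)
+    blended.save("draw_masks_demo.jpg")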
diff --git a/pytorch_object_detection/mask_rcnn/my_dataset_coco.py b/pytorch_object_detection/mask_rcnn/my_dataset_coco.py
new file mode 100644
index 000000000..6946e07e9
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/my_dataset_coco.py
@@ -0,0 +1,154 @@
+import os
+import json
+
+import torch
+from PIL import Image
+import torch.utils.data as data
+from pycocotools.coco import COCO
+from train_utils import coco_remove_images_without_annotations, convert_coco_poly_mask
+
+
+class CocoDetection(data.Dataset):
+ """`MS Coco Detection `_ Dataset.
+
+ Args:
+ root (string): Root directory where images are downloaded to.
+ dataset (string): train or val.
+ transforms (callable, optional): A function/transform that takes input sample and its target as entry
+ and returns a transformed version.
+ """
+
+ def __init__(self, root, dataset="train", transforms=None, years="2017"):
+ super(CocoDetection, self).__init__()
+ assert dataset in ["train", "val"], 'dataset must be in ["train", "val"]'
+ anno_file = f"instances_{dataset}{years}.json"
+ assert os.path.exists(root), "file '{}' does not exist.".format(root)
+ self.img_root = os.path.join(root, f"{dataset}{years}")
+ assert os.path.exists(self.img_root), "path '{}' does not exist.".format(self.img_root)
+ self.anno_path = os.path.join(root, "annotations", anno_file)
+ assert os.path.exists(self.anno_path), "file '{}' does not exist.".format(self.anno_path)
+
+ self.mode = dataset
+ self.transforms = transforms
+ self.coco = COCO(self.anno_path)
+
+        # build the mapping between coco category ids and category names
+        # note: the object80 ids are not contiguous; although there are only 80 classes,
+        # the ids still follow the stuff91 numbering
+ data_classes = dict([(v["id"], v["name"]) for k, v in self.coco.cats.items()])
+ max_index = max(data_classes.keys()) # 90
+        # set the names of missing category ids to N/A
+ coco_classes = {}
+ for k in range(1, max_index + 1):
+ if k in data_classes:
+ coco_classes[k] = data_classes[k]
+ else:
+ coco_classes[k] = "N/A"
+
+ if dataset == "train":
+ json_str = json.dumps(coco_classes, indent=4)
+ with open("coco91_indices.json", "w") as f:
+ f.write(json_str)
+
+ self.coco_classes = coco_classes
+
+ ids = list(sorted(self.coco.imgs.keys()))
+ if dataset == "train":
+            # remove images that have no objects or whose objects have very small area
+ valid_ids = coco_remove_images_without_annotations(self.coco, ids)
+ self.ids = valid_ids
+ else:
+ self.ids = ids
+
+ def parse_targets(self,
+ img_id: int,
+ coco_targets: list,
+ w: int = None,
+ h: int = None):
+ assert w > 0
+ assert h > 0
+
+        # keep only non-crowd objects (iscrowd == 0)
+ anno = [obj for obj in coco_targets if obj['iscrowd'] == 0]
+
+ boxes = [obj["bbox"] for obj in anno]
+
+ # guard against no boxes via resizing
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+ # [xmin, ymin, w, h] -> [xmin, ymin, xmax, ymax]
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2].clamp_(min=0, max=w)
+ boxes[:, 1::2].clamp_(min=0, max=h)
+
+ classes = [obj["category_id"] for obj in anno]
+ classes = torch.tensor(classes, dtype=torch.int64)
+
+ area = torch.tensor([obj["area"] for obj in anno])
+ iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
+
+ segmentations = [obj["segmentation"] for obj in anno]
+ masks = convert_coco_poly_mask(segmentations, h, w)
+
+        # keep only valid boxes, i.e. x_max > x_min and y_max > y_min
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+ boxes = boxes[keep]
+ classes = classes[keep]
+ masks = masks[keep]
+ area = area[keep]
+ iscrowd = iscrowd[keep]
+
+ target = {}
+ target["boxes"] = boxes
+ target["labels"] = classes
+ target["masks"] = masks
+ target["image_id"] = torch.tensor([img_id])
+
+ # for conversion to coco api
+ target["area"] = area
+ target["iscrowd"] = iscrowd
+
+ return target
+
+ def __getitem__(self, index):
+ """
+ Args:
+ index (int): Index
+
+ Returns:
+ tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``.
+ """
+ coco = self.coco
+ img_id = self.ids[index]
+ ann_ids = coco.getAnnIds(imgIds=img_id)
+ coco_target = coco.loadAnns(ann_ids)
+
+ path = coco.loadImgs(img_id)[0]['file_name']
+ img = Image.open(os.path.join(self.img_root, path)).convert('RGB')
+
+ w, h = img.size
+ target = self.parse_targets(img_id, coco_target, w, h)
+ if self.transforms is not None:
+ img, target = self.transforms(img, target)
+
+ return img, target
+
+ def __len__(self):
+ return len(self.ids)
+
+ def get_height_and_width(self, index):
+ coco = self.coco
+ img_id = self.ids[index]
+
+ img_info = coco.loadImgs(img_id)[0]
+ w = img_info["width"]
+ h = img_info["height"]
+ return h, w
+
+ @staticmethod
+ def collate_fn(batch):
+ return tuple(zip(*batch))
+
+
+if __name__ == '__main__':
+ train = CocoDetection("/data/coco2017", dataset="train")
+ print(len(train))
+ t = train[0]
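+
+    # Illustrative addition, not part of the original file: collate_fn simply
+    # regroups a batch into tuples, so variable-sized targets survive a DataLoader.
+    # It assumes the same local /data/coco2017 path as the lines above.
+    loader = data.DataLoader(train, batch_size=2, shuffle=False,
+                             collate_fn=CocoDetection.collate_fn)
+    images, targets = next(iter(loader))
+    print(len(images), [tgt["boxes"].shape for tgt in targets])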
diff --git a/pytorch_object_detection/mask_rcnn/my_dataset_voc.py b/pytorch_object_detection/mask_rcnn/my_dataset_voc.py
new file mode 100644
index 000000000..2034b5ace
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/my_dataset_voc.py
@@ -0,0 +1,215 @@
+import os
+import json
+
+from lxml import etree
+import numpy as np
+from PIL import Image
+import torch
+from torch.utils.data import Dataset
+from train_utils import convert_to_coco_api
+
+
+class VOCInstances(Dataset):
+ def __init__(self, voc_root, year="2012", txt_name: str = "train.txt", transforms=None):
+ super().__init__()
+ if isinstance(year, int):
+ year = str(year)
+ assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
+ if "VOCdevkit" in voc_root:
+ root = os.path.join(voc_root, f"VOC{year}")
+ else:
+ root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
+ assert os.path.exists(root), "path '{}' does not exist.".format(root)
+ image_dir = os.path.join(root, 'JPEGImages')
+ xml_dir = os.path.join(root, 'Annotations')
+ mask_dir = os.path.join(root, 'SegmentationObject')
+
+ txt_path = os.path.join(root, "ImageSets", "Segmentation", txt_name)
+ assert os.path.exists(txt_path), "file '{}' does not exist.".format(txt_path)
+ with open(os.path.join(txt_path), "r") as f:
+ file_names = [x.strip() for x in f.readlines() if len(x.strip()) > 0]
+
+ # read class_indict
+ json_file = 'pascal_voc_indices.json'
+ assert os.path.exists(json_file), "{} file not exist.".format(json_file)
+ with open(json_file, 'r') as f:
+ idx2classes = json.load(f)
+ self.class_dict = dict([(v, k) for k, v in idx2classes.items()])
+
+        self.images_path = []     # image paths
+        self.xmls_path = []       # xml annotation paths
+        self.xmls_info = []       # parsed xml dicts
+        self.masks_path = []      # SegmentationObject image paths
+        self.objects_bboxes = []  # parsed object boxes and related info
+        self.masks = []           # loaded SegmentationObject mask arrays
+
+        # check that the image, xml file and mask all exist
+ images_path = [os.path.join(image_dir, x + ".jpg") for x in file_names]
+ xmls_path = [os.path.join(xml_dir, x + '.xml') for x in file_names]
+ masks_path = [os.path.join(mask_dir, x + ".png") for x in file_names]
+ for idx, (img_path, xml_path, mask_path) in enumerate(zip(images_path, xmls_path, masks_path)):
+ assert os.path.exists(img_path), f"not find {img_path}"
+ assert os.path.exists(xml_path), f"not find {xml_path}"
+ assert os.path.exists(mask_path), f"not find {mask_path}"
+
+            # parse bbox info from the xml
+ with open(xml_path) as fid:
+ xml_str = fid.read()
+ xml = etree.fromstring(xml_str)
+            obs_dict = parse_xml_to_dict(xml)["annotation"]  # parse the xml file into a dict
+            obs_bboxes = parse_objects(obs_dict, xml_path, self.class_dict, idx)  # parse object info
+ num_objs = obs_bboxes["boxes"].shape[0]
+
+            # read the SegmentationObject mask and check it matches the number of bboxes
+ instances_mask = Image.open(mask_path)
+ instances_mask = np.array(instances_mask)
+            instances_mask[instances_mask == 255] = 0  # 255 marks background or ignored regions; treat it as background (0) for simplicity
+
+            # check that the number of annotated bboxes equals the number of instances
+ num_instances = instances_mask.max()
+ if num_objs != num_instances:
+ print(f"warning: num_boxes:{num_objs} and num_instances:{num_instances} do not correspond. "
+ f"skip image:{img_path}")
+ continue
+
+ self.images_path.append(img_path)
+ self.xmls_path.append(xml_path)
+ self.xmls_info.append(obs_dict)
+ self.masks_path.append(mask_path)
+ self.objects_bboxes.append(obs_bboxes)
+ self.masks.append(instances_mask)
+
+ self.transforms = transforms
+ self.coco = convert_to_coco_api(self)
+
+ def parse_mask(self, idx: int):
+ mask = self.masks[idx]
+        c = mask.max()  # the max index equals the number of instances
+        masks = []
+        # store each instance's mask in its own channel
+ for i in range(1, c+1):
+ masks.append(mask == i)
+ masks = np.stack(masks, axis=0)
+ return torch.as_tensor(masks, dtype=torch.uint8)
+
+ def __getitem__(self, idx):
+ """
+ Args:
+ idx (int): Index
+
+ Returns:
+ tuple: (image, target) where target is the image segmentation.
+ """
+ img = Image.open(self.images_path[idx]).convert('RGB')
+ target = self.objects_bboxes[idx]
+ masks = self.parse_mask(idx)
+ target["masks"] = masks
+
+ if self.transforms is not None:
+ img, target = self.transforms(img, target)
+
+ return img, target
+
+ def __len__(self):
+ return len(self.images_path)
+
+ def get_height_and_width(self, idx):
+ """方便统计所有图片的高宽比例信息"""
+ # read xml
+ data = self.xmls_info[idx]
+ data_height = int(data["size"]["height"])
+ data_width = int(data["size"]["width"])
+ return data_height, data_width
+
+ def get_annotations(self, idx):
+ """方便构建COCO()"""
+ data = self.xmls_info[idx]
+ h = int(data["size"]["height"])
+ w = int(data["size"]["width"])
+ target = self.objects_bboxes[idx]
+ masks = self.parse_mask(idx)
+ target["masks"] = masks
+ return target, h, w
+
+ @staticmethod
+ def collate_fn(batch):
+ return tuple(zip(*batch))
+
+
+def parse_xml_to_dict(xml):
+ """
+    Parse an xml file into a dict; adapted from tensorflow's recursive_parse_xml_to_dict.
+ Args:
+ xml: xml tree obtained by parsing XML file contents using lxml.etree
+
+ Returns:
+ Python dictionary holding XML contents.
+ """
+
+    if len(xml) == 0:  # reached a leaf node, return its tag and text
+ return {xml.tag: xml.text}
+
+ result = {}
+ for child in xml:
+        child_result = parse_xml_to_dict(child)  # recursively parse child tags
+ if child.tag != 'object':
+ result[child.tag] = child_result[child.tag]
+ else:
+            if child.tag not in result:  # there may be multiple objects, so collect them in a list
+ result[child.tag] = []
+ result[child.tag].append(child_result[child.tag])
+ return {xml.tag: result}
+
+
+def parse_objects(data: dict, xml_path: str, class_dict: dict, idx: int):
+ """
+    Parse bboxes, labels, iscrowd and area information.
+    Args:
+        data: annotation data parsed from the xml into a dict
+        xml_path: path of the corresponding xml file
+        class_dict: mapping between class names and indices
+        idx: index of the image
+
+ Returns:
+
+ """
+ boxes = []
+ labels = []
+ iscrowd = []
+ assert "object" in data, "{} lack of object information.".format(xml_path)
+ for obj in data["object"]:
+ xmin = float(obj["bndbox"]["xmin"])
+ xmax = float(obj["bndbox"]["xmax"])
+ ymin = float(obj["bndbox"]["ymin"])
+ ymax = float(obj["bndbox"]["ymax"])
+
+        # further data check: some annotations may have w or h equal to 0, which makes the regression loss nan
+ if xmax <= xmin or ymax <= ymin:
+ print("Warning: in '{}' xml, there are some bbox w/h <=0".format(xml_path))
+ continue
+
+ boxes.append([xmin, ymin, xmax, ymax])
+ labels.append(int(class_dict[obj["name"]]))
+ if "difficult" in obj:
+ iscrowd.append(int(obj["difficult"]))
+ else:
+ iscrowd.append(0)
+
+ # convert everything into a torch.Tensor
+ boxes = torch.as_tensor(boxes, dtype=torch.float32)
+ labels = torch.as_tensor(labels, dtype=torch.int64)
+ iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64)
+ image_id = torch.tensor([idx])
+ area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
+
+ return {"boxes": boxes,
+ "labels": labels,
+ "iscrowd": iscrowd,
+ "image_id": image_id,
+ "area": area}
+
+
+if __name__ == '__main__':
+ dataset = VOCInstances(voc_root="/data/")
+ print(len(dataset))
+ d1 = dataset[0]
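+
+    # Illustrative addition, not part of the original file: each sample is an
+    # (image, target) pair and target["masks"] holds one binary channel per
+    # instance. It assumes the same local /data/ VOC path as the line above.
+    image, target = d1
+    print(target["boxes"].shape, target["masks"].shape, target["labels"])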
diff --git a/pytorch_object_detection/mask_rcnn/network_files/__init__.py b/pytorch_object_detection/mask_rcnn/network_files/__init__.py
new file mode 100644
index 000000000..3a2ed2299
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/__init__.py
@@ -0,0 +1,3 @@
+from .faster_rcnn_framework import FasterRCNN, FastRCNNPredictor
+from .rpn_function import AnchorsGenerator
+from .mask_rcnn import MaskRCNN
diff --git a/pytorch_object_detection/mask_rcnn/network_files/boxes.py b/pytorch_object_detection/mask_rcnn/network_files/boxes.py
new file mode 100644
index 000000000..8eeca4573
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/boxes.py
@@ -0,0 +1,181 @@
+import torch
+from typing import Tuple
+from torch import Tensor
+import torchvision
+
+
+def nms(boxes, scores, iou_threshold):
+ # type: (Tensor, Tensor, float) -> Tensor
+ """
+ Performs non-maximum suppression (NMS) on the boxes according
+ to their intersection-over-union (IoU).
+
+ NMS iteratively removes lower scoring boxes which have an
+ IoU greater than iou_threshold with another (higher scoring)
+ box.
+
+ Parameters
+ ----------
+ boxes : Tensor[N, 4])
+ boxes to perform NMS on. They
+ are expected to be in (x1, y1, x2, y2) format
+ scores : Tensor[N]
+ scores for each one of the boxes
+ iou_threshold : float
+ discards all overlapping
+ boxes with IoU > iou_threshold
+
+ Returns
+ -------
+ keep : Tensor
+ int64 tensor with the indices
+ of the elements that have been kept
+ by NMS, sorted in decreasing order of scores
+ """
+ return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
+
+
+def batched_nms(boxes, scores, idxs, iou_threshold):
+ # type: (Tensor, Tensor, Tensor, float) -> Tensor
+ """
+ Performs non-maximum suppression in a batched fashion.
+
+ Each index value correspond to a category, and NMS
+ will not be applied between elements of different categories.
+
+ Parameters
+ ----------
+ boxes : Tensor[N, 4]
+ boxes where NMS will be performed. They
+ are expected to be in (x1, y1, x2, y2) format
+ scores : Tensor[N]
+ scores for each one of the boxes
+ idxs : Tensor[N]
+ indices of the categories for each one of the boxes.
+ iou_threshold : float
+ discards all overlapping boxes
+ with IoU < iou_threshold
+
+ Returns
+ -------
+ keep : Tensor
+ int64 tensor with the indices of
+ the elements that have been kept by NMS, sorted
+ in decreasing order of scores
+ """
+ if boxes.numel() == 0:
+ return torch.empty((0,), dtype=torch.int64, device=boxes.device)
+
+ # strategy: in order to perform NMS independently per class.
+ # we add an offset to all the boxes. The offset is dependent
+ # only on the class idx, and is large enough so that boxes
+ # from different classes do not overlap
+    # get the largest coordinate value among all boxes (xmin, ymin, xmax, ymax)
+ max_coordinate = boxes.max()
+
+ # to(): Performs Tensor dtype and/or device conversion
+    # generate a large offset for every category / level
+    # to() here only makes the dtype and device of the generated tensor match boxes
+    offsets = idxs.to(boxes) * (max_coordinate + 1)
+    # after adding the per-level offset, boxes of different categories / levels can no longer overlap
+ boxes_for_nms = boxes + offsets[:, None]
+ keep = nms(boxes_for_nms, scores, iou_threshold)
+ return keep
+
+
+def remove_small_boxes(boxes, min_size):
+ # type: (Tensor, float) -> Tensor
+ """
+    Remove boxes which contain at least one side smaller than min_size.
+    Returns the indices of boxes whose width and height are both at least min_size.
+ Arguments:
+ boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
+ min_size (float): minimum size
+
+ Returns:
+ keep (Tensor[K]): indices of the boxes that have both sides
+ larger than min_size
+ """
+    ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]  # widths and heights of the predicted boxes
+    # keep = (ws >= min_size) & (hs >= min_size)  # True when both width and height exceed the threshold
+ keep = torch.logical_and(torch.ge(ws, min_size), torch.ge(hs, min_size))
+ # nonzero(): Returns a tensor containing the indices of all non-zero elements of input
+ # keep = keep.nonzero().squeeze(1)
+ keep = torch.where(keep)[0]
+ return keep
+
+
+def clip_boxes_to_image(boxes, size):
+ # type: (Tensor, Tuple[int, int]) -> Tensor
+ """
+ Clip boxes so that they lie inside an image of size `size`.
+    Clip predicted boxes so that out-of-range coordinates are moved onto the image border.
+
+ Arguments:
+ boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
+ size (Tuple[height, width]): size of the image
+
+ Returns:
+ clipped_boxes (Tensor[N, 4])
+ """
+ dim = boxes.dim()
+ boxes_x = boxes[..., 0::2] # x1, x2
+ boxes_y = boxes[..., 1::2] # y1, y2
+ height, width = size
+
+ if torchvision._is_tracing():
+ boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
+ boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
+ boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
+ boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
+ else:
+        boxes_x = boxes_x.clamp(min=0, max=width)   # clamp x coordinates to [0, width]
+        boxes_y = boxes_y.clamp(min=0, max=height)  # clamp y coordinates to [0, height]
+
+ clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
+ return clipped_boxes.reshape(boxes.shape)
+
+
+def box_area(boxes):
+ """
+ Computes the area of a set of bounding boxes, which are specified by its
+ (x1, y1, x2, y2) coordinates.
+
+ Arguments:
+ boxes (Tensor[N, 4]): boxes for which the area will be computed. They
+ are expected to be in (x1, y1, x2, y2) format
+
+ Returns:
+ area (Tensor[N]): area for each box
+ """
+ return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+def box_iou(boxes1, boxes2):
+ """
+ Return intersection-over-union (Jaccard index) of boxes.
+
+ Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+
+ Arguments:
+ boxes1 (Tensor[N, 4])
+ boxes2 (Tensor[M, 4])
+
+ Returns:
+ iou (Tensor[N, M]): the NxM matrix containing the pairwise
+ IoU values for every element in boxes1 and boxes2
+ """
+ area1 = box_area(boxes1)
+ area2 = box_area(boxes2)
+
+ # When the shapes do not match,
+ # the shape of the returned output tensor follows the broadcasting rules
+ lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # left-top [N,M,2]
+ rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # right-bottom [N,M,2]
+
+ wh = (rb - lt).clamp(min=0) # [N,M,2]
+ inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
+
+ iou = inter / (area1[:, None] + area2 - inter)
+ return iou
+
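+
+# Illustrative example, not part of the original file: IoU of two toy boxes and
+# class-aware NMS over them. The coordinates, scores and threshold are arbitrary
+# assumptions.
+if __name__ == '__main__':
+    b1 = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
+    b2 = torch.tensor([[5.0, 5.0, 15.0, 15.0]])
+    print(box_iou(b1, b2))  # intersection 25 / union 175 ≈ 0.1429
+
+    all_boxes = torch.cat([b1, b2], dim=0)
+    scores = torch.tensor([0.9, 0.8])
+    idxs = torch.tensor([0, 1])  # different classes, so both boxes survive NMS
+    print(batched_nms(all_boxes, scores, idxs, iou_threshold=0.1))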
diff --git a/pytorch_object_detection/mask_rcnn/network_files/det_utils.py b/pytorch_object_detection/mask_rcnn/network_files/det_utils.py
new file mode 100644
index 000000000..6b4fe6013
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/det_utils.py
@@ -0,0 +1,408 @@
+import torch
+import math
+from typing import List, Tuple
+from torch import Tensor
+
+
+class BalancedPositiveNegativeSampler(object):
+ """
+ This class samples batches, ensuring that they contain a fixed proportion of positives
+ """
+
+ def __init__(self, batch_size_per_image, positive_fraction):
+ # type: (int, float) -> None
+ """
+ Arguments:
+ batch_size_per_image (int): number of elements to be selected per image
+ positive_fraction (float): percentage of positive elements per batch
+ """
+ self.batch_size_per_image = batch_size_per_image
+ self.positive_fraction = positive_fraction
+
+ def __call__(self, matched_idxs):
+ # type: (List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
+ """
+ Arguments:
+ matched idxs: list of tensors containing -1, 0 or positive values.
+ Each tensor corresponds to a specific image.
+ -1 values are ignored, 0 are considered as negatives and > 0 as
+ positives.
+
+ Returns:
+ pos_idx (list[tensor])
+ neg_idx (list[tensor])
+
+ Returns two lists of binary masks for each image.
+ The first list contains the positive elements that were selected,
+ and the second list the negative example.
+ """
+ pos_idx = []
+ neg_idx = []
+        # iterate over the matched_idxs of each image
+ for matched_idxs_per_image in matched_idxs:
+ # >= 1的为正样本, nonzero返回非零元素索引
+ # positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
+ positive = torch.where(torch.ge(matched_idxs_per_image, 1))[0]
+ # = 0的为负样本
+ # negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
+ negative = torch.where(torch.eq(matched_idxs_per_image, 0))[0]
+
+            # target number of positive samples
+            num_pos = int(self.batch_size_per_image * self.positive_fraction)
+            # protect against not enough positive examples
+            # if there are not enough positives, use all of them
+            num_pos = min(positive.numel(), num_pos)
+            # target number of negative samples
+            num_neg = self.batch_size_per_image - num_pos
+            # protect against not enough negative examples
+            # if there are not enough negatives, use all of them
+            num_neg = min(negative.numel(), num_neg)
+
+ # randomly select positive and negative examples
+ # Returns a random permutation of integers from 0 to n - 1.
+            # randomly select the specified number of positive and negative samples
+ perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
+ perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
+
+ pos_idx_per_image = positive[perm1]
+ neg_idx_per_image = negative[perm2]
+
+ # create binary mask from indices
+ pos_idx_per_image_mask = torch.zeros_like(
+ matched_idxs_per_image, dtype=torch.uint8
+ )
+ neg_idx_per_image_mask = torch.zeros_like(
+ matched_idxs_per_image, dtype=torch.uint8
+ )
+
+ pos_idx_per_image_mask[pos_idx_per_image] = 1
+ neg_idx_per_image_mask[neg_idx_per_image] = 1
+
+ pos_idx.append(pos_idx_per_image_mask)
+ neg_idx.append(neg_idx_per_image_mask)
+
+ return pos_idx, neg_idx
+
+
+@torch.jit._script_if_tracing
+def encode_boxes(reference_boxes, proposals, weights):
+ # type: (torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
+ """
+ Encode a set of proposals with respect to some
+ reference boxes
+
+ Arguments:
+ reference_boxes (Tensor): reference boxes(gt)
+ proposals (Tensor): boxes to be encoded(anchors)
+ weights:
+ """
+
+ # perform some unpacking to make it JIT-fusion friendly
+ wx = weights[0]
+ wy = weights[1]
+ ww = weights[2]
+ wh = weights[3]
+
+ # unsqueeze()
+ # Returns a new tensor with a dimension of size one inserted at the specified position.
+ proposals_x1 = proposals[:, 0].unsqueeze(1)
+ proposals_y1 = proposals[:, 1].unsqueeze(1)
+ proposals_x2 = proposals[:, 2].unsqueeze(1)
+ proposals_y2 = proposals[:, 3].unsqueeze(1)
+
+ reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
+ reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
+ reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
+ reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)
+
+ # implementation starts here
+ # parse widths and heights
+ ex_widths = proposals_x2 - proposals_x1
+ ex_heights = proposals_y2 - proposals_y1
+ # parse coordinate of center point
+ ex_ctr_x = proposals_x1 + 0.5 * ex_widths
+ ex_ctr_y = proposals_y1 + 0.5 * ex_heights
+
+ gt_widths = reference_boxes_x2 - reference_boxes_x1
+ gt_heights = reference_boxes_y2 - reference_boxes_y1
+ gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
+ gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights
+
+ targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
+ targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
+ targets_dw = ww * torch.log(gt_widths / ex_widths)
+ targets_dh = wh * torch.log(gt_heights / ex_heights)
+
+ targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
+ return targets
+
+
+class BoxCoder(object):
+ """
+ This class encodes and decodes a set of bounding boxes into
+ the representation used for training the regressors.
+ """
+
+ def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)):
+ # type: (Tuple[float, float, float, float], float) -> None
+ """
+ Arguments:
+ weights (4-element tuple)
+ bbox_xform_clip (float)
+ """
+ self.weights = weights
+ self.bbox_xform_clip = bbox_xform_clip
+
+ def encode(self, reference_boxes, proposals):
+ # type: (List[Tensor], List[Tensor]) -> List[Tensor]
+ """
+        Compute the regression parameters from the anchors and their matched gt boxes.
+        Args:
+            reference_boxes: List[Tensor] gt_boxes matched to each proposal/anchor
+            proposals: List[Tensor] anchors/proposals
+
+        Returns: regression parameters
+
+        """
+        # record the number of anchors per image so the concatenated tensor can be split back later
+        # reference_boxes and proposals share the same data structure
+ boxes_per_image = [len(b) for b in reference_boxes]
+ reference_boxes = torch.cat(reference_boxes, dim=0)
+ proposals = torch.cat(proposals, dim=0)
+
+ # targets_dx, targets_dy, targets_dw, targets_dh
+ targets = self.encode_single(reference_boxes, proposals)
+ return targets.split(boxes_per_image, 0)
+
+ def encode_single(self, reference_boxes, proposals):
+ """
+ Encode a set of proposals with respect to some
+ reference boxes
+
+ Arguments:
+ reference_boxes (Tensor): reference boxes
+ proposals (Tensor): boxes to be encoded
+ """
+ dtype = reference_boxes.dtype
+ device = reference_boxes.device
+ weights = torch.as_tensor(self.weights, dtype=dtype, device=device)
+ targets = encode_boxes(reference_boxes, proposals, weights)
+
+ return targets
+
+ def decode(self, rel_codes, boxes):
+ # type: (Tensor, List[Tensor]) -> Tensor
+ """
+
+ Args:
+ rel_codes: bbox regression parameters
+ boxes: anchors/proposals
+
+ Returns:
+
+ """
+ assert isinstance(boxes, (list, tuple))
+ assert isinstance(rel_codes, torch.Tensor)
+ boxes_per_image = [b.size(0) for b in boxes]
+ concat_boxes = torch.cat(boxes, dim=0)
+
+ box_sum = 0
+ for val in boxes_per_image:
+ box_sum += val
+
+        # apply the predicted bbox regression parameters to the anchors to get the predicted bbox coordinates
+ pred_boxes = self.decode_single(
+ rel_codes, concat_boxes
+ )
+
+        # avoid a reshape error when pred_boxes is empty
+ if box_sum > 0:
+ pred_boxes = pred_boxes.reshape(box_sum, -1, 4)
+
+ return pred_boxes
+
+ def decode_single(self, rel_codes, boxes):
+ """
+ From a set of original boxes and encoded relative box offsets,
+ get the decoded boxes.
+
+ Arguments:
+ rel_codes (Tensor): encoded boxes (bbox regression parameters)
+ boxes (Tensor): reference boxes (anchors/proposals)
+ """
+ boxes = boxes.to(rel_codes.dtype)
+
+ # xmin, ymin, xmax, ymax
+        widths = boxes[:, 2] - boxes[:, 0]   # anchor/proposal widths
+        heights = boxes[:, 3] - boxes[:, 1]  # anchor/proposal heights
+        ctr_x = boxes[:, 0] + 0.5 * widths   # anchor/proposal center x coordinates
+        ctr_y = boxes[:, 1] + 0.5 * heights  # anchor/proposal center y coordinates
+
+        wx, wy, ww, wh = self.weights  # [1,1,1,1] in the RPN, [10,10,5,5] in fast rcnn
+        dx = rel_codes[:, 0::4] / wx   # predicted center-x regression parameters of the anchors/proposals
+        dy = rel_codes[:, 1::4] / wy   # predicted center-y regression parameters of the anchors/proposals
+        dw = rel_codes[:, 2::4] / ww   # predicted width regression parameters of the anchors/proposals
+        dh = rel_codes[:, 3::4] / wh   # predicted height regression parameters of the anchors/proposals
+
+ # limit max value, prevent sending too large values into torch.exp()
+ # self.bbox_xform_clip=math.log(1000. / 16) 4.135
+ dw = torch.clamp(dw, max=self.bbox_xform_clip)
+ dh = torch.clamp(dh, max=self.bbox_xform_clip)
+
+ pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
+ pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
+ pred_w = torch.exp(dw) * widths[:, None]
+ pred_h = torch.exp(dh) * heights[:, None]
+
+ # xmin
+ pred_boxes1 = pred_ctr_x - torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
+ # ymin
+ pred_boxes2 = pred_ctr_y - torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
+ # xmax
+ pred_boxes3 = pred_ctr_x + torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
+ # ymax
+ pred_boxes4 = pred_ctr_y + torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
+
+ pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)
+ return pred_boxes
+
+
+class Matcher(object):
+ BELOW_LOW_THRESHOLD = -1
+ BETWEEN_THRESHOLDS = -2
+
+ __annotations__ = {
+ 'BELOW_LOW_THRESHOLD': int,
+ 'BETWEEN_THRESHOLDS': int,
+ }
+
+ def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
+ # type: (float, float, bool) -> None
+ """
+ Args:
+ high_threshold (float): quality values greater than or equal to
+ this value are candidate matches.
+ low_threshold (float): a lower quality threshold used to stratify
+ matches into three levels:
+ 1) matches >= high_threshold
+ 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold)
+ 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold)
+ allow_low_quality_matches (bool): if True, produce additional matches
+ for predictions that have only low-quality match candidates. See
+ set_low_quality_matches_ for more details.
+ """
+ self.BELOW_LOW_THRESHOLD = -1
+ self.BETWEEN_THRESHOLDS = -2
+ assert low_threshold <= high_threshold
+ self.high_threshold = high_threshold # 0.7
+ self.low_threshold = low_threshold # 0.3
+ self.allow_low_quality_matches = allow_low_quality_matches
+
+ def __call__(self, match_quality_matrix):
+ """
+ 计算anchors与每个gtboxes匹配的iou最大值,并记录索引,
+ iou= self.low_threshold) & (
+ matched_vals < self.high_threshold
+ )
+ # iou小于low_threshold的matches索引置为-1
+ matches[below_low_threshold] = self.BELOW_LOW_THRESHOLD # -1
+
+ # iou在[low_threshold, high_threshold]之间的matches索引置为-2
+ matches[between_thresholds] = self.BETWEEN_THRESHOLDS # -2
+
+ if self.allow_low_quality_matches:
+ assert all_matches is not None
+ self.set_low_quality_matches_(matches, all_matches, match_quality_matrix)
+
+ return matches
+
+ def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix):
+ """
+ Produce additional matches for predictions that have only low-quality matches.
+ Specifically, for each ground-truth find the set of predictions that have
+ maximum overlap with it (including ties); for each prediction in that set, if
+ it is unmatched, then match it to the ground-truth with which it has the highest
+ quality value.
+ """
+ # For each gt, find the prediction with which it has highest quality
+        # for each gt box, find the anchor with which it has the highest iou;
+        # highest_quality_foreach_gt holds that maximum iou value
+        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)  # the dimension to reduce.
+
+ # Find highest quality match available, even if it is low, including ties
+        # find the anchor indices with the highest iou for each gt box; a gt may tie with several anchors
+ # gt_pred_pairs_of_highest_quality = torch.nonzero(
+ # match_quality_matrix == highest_quality_foreach_gt[:, None]
+ # )
+ gt_pred_pairs_of_highest_quality = torch.where(
+ torch.eq(match_quality_matrix, highest_quality_foreach_gt[:, None])
+ )
+ # Example gt_pred_pairs_of_highest_quality:
+ # tensor([[ 0, 39796],
+ # [ 1, 32055],
+ # [ 1, 32070],
+ # [ 2, 39190],
+ # [ 2, 40255],
+ # [ 3, 40390],
+ # [ 3, 41455],
+ # [ 4, 45470],
+ # [ 5, 45325],
+ # [ 5, 46390]])
+ # Each row is a (gt index, prediction index)
+ # Note how gt items 1, 2, 3, and 5 each have two ties
+
+        # gt_pred_pairs_of_highest_quality[:, 0] would be the gt index (not needed here)
+        # pre_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1]
+        pre_inds_to_update = gt_pred_pairs_of_highest_quality[1]
+        # keep the gt match with the highest iou for these anchors, even if the iou is below the threshold
+ matches[pre_inds_to_update] = all_matches[pre_inds_to_update]
+
+
+def smooth_l1_loss(input, target, beta: float = 1. / 9, size_average: bool = True):
+ """
+ very similar to the smooth_l1_loss from pytorch, but with
+ the extra beta parameter
+ """
+ n = torch.abs(input - target)
+ # cond = n < beta
+ cond = torch.lt(n, beta)
+ loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
+ if size_average:
+ return loss.mean()
+ return loss.sum()
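+
+
+# Illustrative round-trip check, not part of the original file: encoding a gt box
+# against an anchor and decoding the result should recover the gt box. The weights
+# (1., 1., 1., 1.) match the RPN setting mentioned in the comments above; the box
+# coordinates are arbitrary assumptions.
+if __name__ == '__main__':
+    box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
+    anchors = [torch.tensor([[0.0, 0.0, 10.0, 10.0]])]
+    gt_boxes = [torch.tensor([[2.0, 2.0, 8.0, 12.0]])]
+    regression_targets = box_coder.encode(gt_boxes, anchors)[0]
+    decoded = box_coder.decode(regression_targets, anchors)
+    print(decoded.reshape(-1, 4))  # ≈ tensor([[ 2.,  2.,  8., 12.]])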
diff --git a/pytorch_object_detection/mask_rcnn/network_files/faster_rcnn_framework.py b/pytorch_object_detection/mask_rcnn/network_files/faster_rcnn_framework.py
new file mode 100644
index 000000000..827d8c653
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/faster_rcnn_framework.py
@@ -0,0 +1,354 @@
+import warnings
+from collections import OrderedDict
+from typing import Tuple, List, Dict, Optional, Union
+
+import torch
+from torch import nn, Tensor
+import torch.nn.functional as F
+from torchvision.ops import MultiScaleRoIAlign
+
+from .roi_head import RoIHeads
+from .transform import GeneralizedRCNNTransform
+from .rpn_function import AnchorsGenerator, RPNHead, RegionProposalNetwork
+
+
+class FasterRCNNBase(nn.Module):
+ """
+ Main class for Generalized R-CNN.
+
+ Arguments:
+ backbone (nn.Module):
+ rpn (nn.Module):
+ roi_heads (nn.Module): takes the features + the proposals from the RPN and computes
+ detections / masks from it.
+ transform (nn.Module): performs the data transformation from the inputs to feed into
+ the model
+ """
+
+ def __init__(self, backbone, rpn, roi_heads, transform):
+ super(FasterRCNNBase, self).__init__()
+ self.transform = transform
+ self.backbone = backbone
+ self.rpn = rpn
+ self.roi_heads = roi_heads
+ # used only on torchscript mode
+ self._has_warned = False
+
+ @torch.jit.unused
+ def eager_outputs(self, losses, detections):
+ # type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Union[Dict[str, Tensor], List[Dict[str, Tensor]]]
+ if self.training:
+ return losses
+
+ return detections
+
+ def forward(self, images, targets=None):
+ # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
+ """
+ Arguments:
+ images (list[Tensor]): images to be processed
+ targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
+
+ Returns:
+ result (list[BoxList] or dict[Tensor]): the output from the model.
+ During training, it returns a dict[Tensor] which contains the losses.
+            During testing, it returns a list[BoxList] containing additional fields
+ like `scores`, `labels` and `mask` (for Mask R-CNN models).
+
+ """
+ if self.training and targets is None:
+ raise ValueError("In training mode, targets should be passed")
+
+ if self.training:
+ assert targets is not None
+            for target in targets:         # further check that the boxes field of each target is well formed
+                boxes = target["boxes"]
+                if isinstance(boxes, torch.Tensor):
+                    if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
+                        raise ValueError("Expected target boxes to be a tensor "
+                                         "of shape [N, 4], got {:}.".format(
+ boxes.shape))
+ else:
+ raise ValueError("Expected target boxes to be of type "
+ "Tensor, got {:}.".format(type(boxes)))
+
+ original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], [])
+ for img in images:
+ val = img.shape[-2:]
+            assert len(val) == 2  # guard against a 1-d input
+ original_image_sizes.append((val[0], val[1]))
+ # original_image_sizes = [img.shape[-2:] for img in images]
+
+        images, targets = self.transform(images, targets)  # preprocess the images
+        # print(images.tensors.shape)
+        features = self.backbone(images.tensors)  # feed the images through the backbone to get feature maps
+        if isinstance(features, torch.Tensor):  # when predicting on a single feature map, wrap it in an OrderedDict under key '0'
+            features = OrderedDict([('0', features)])  # when predicting on multiple feature maps, features is already an OrderedDict
+
+        # pass the feature maps and the annotated targets into the rpn
+        # proposals: List[Tensor], Tensor_shape: [num_proposals, 4],
+        # each proposal is in absolute coordinates, in (x1, y1, x2, y2) format
+        proposals, proposal_losses = self.rpn(images, features, targets)
+
+        # pass the rpn outputs and the annotated targets into the second (fast rcnn) stage
+        detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
+
+        # post-process the predictions (mainly map the bboxes back to the original image scale)
+ detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
+
+ losses = {}
+ losses.update(detector_losses)
+ losses.update(proposal_losses)
+
+ if torch.jit.is_scripting():
+ if not self._has_warned:
+ warnings.warn("RCNN always returns a (Losses, Detections) tuple in scripting")
+ self._has_warned = True
+ return losses, detections
+ else:
+ return self.eager_outputs(losses, detections)
+
+ # if self.training:
+ # return losses
+ #
+ # return detections
+
+
+class TwoMLPHead(nn.Module):
+ """
+ Standard heads for FPN-based models
+
+ Arguments:
+ in_channels (int): number of input channels
+ representation_size (int): size of the intermediate representation
+ """
+
+ def __init__(self, in_channels, representation_size):
+ super(TwoMLPHead, self).__init__()
+
+ self.fc6 = nn.Linear(in_channels, representation_size)
+ self.fc7 = nn.Linear(representation_size, representation_size)
+
+ def forward(self, x):
+ x = x.flatten(start_dim=1)
+
+ x = F.relu(self.fc6(x))
+ x = F.relu(self.fc7(x))
+
+ return x
+
+
+class FastRCNNPredictor(nn.Module):
+ """
+ Standard classification + bounding box regression layers
+ for Fast R-CNN.
+
+ Arguments:
+ in_channels (int): number of input channels
+ num_classes (int): number of output classes (including background)
+ """
+
+ def __init__(self, in_channels, num_classes):
+ super(FastRCNNPredictor, self).__init__()
+ self.cls_score = nn.Linear(in_channels, num_classes)
+ self.bbox_pred = nn.Linear(in_channels, num_classes * 4)
+
+ def forward(self, x):
+ if x.dim() == 4:
+ assert list(x.shape[2:]) == [1, 1]
+ x = x.flatten(start_dim=1)
+ scores = self.cls_score(x)
+ bbox_deltas = self.bbox_pred(x)
+
+ return scores, bbox_deltas
+
+
+class FasterRCNN(FasterRCNNBase):
+ """
+ Implements Faster R-CNN.
+
+ The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+ image, and should be in 0-1 range. Different images can have different sizes.
+
+ The behavior of the model changes depending if it is in training or evaluation mode.
+
+ During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ containing:
+ - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values
+ between 0 and H and 0 and W
+ - labels (Int64Tensor[N]): the class label for each ground-truth box
+
+ The model returns a Dict[Tensor] during training, containing the classification and regression
+ losses for both the RPN and the R-CNN.
+
+ During inference, the model requires only the input tensors, and returns the post-processed
+ predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+ follows:
+ - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between
+ 0 and H and 0 and W
+ - labels (Int64Tensor[N]): the predicted labels for each image
+        - scores (Tensor[N]): the scores of each prediction
+
+ Arguments:
+ backbone (nn.Module): the network used to compute the features for the model.
+ It should contain a out_channels attribute, which indicates the number of output
+ channels that each feature map has (and it should be the same for all feature maps).
+            The backbone should return a single Tensor or an OrderedDict[Tensor].
+ num_classes (int): number of output classes of the model (including the background).
+ If box_predictor is specified, num_classes should be None.
+ min_size (int): minimum size of the image to be rescaled before feeding it to the backbone
+ max_size (int): maximum size of the image to be rescaled before feeding it to the backbone
+ image_mean (Tuple[float, float, float]): mean values used for input normalization.
+ They are generally the mean values of the dataset on which the backbone has been trained
+ on
+ image_std (Tuple[float, float, float]): std values used for input normalization.
+ They are generally the std values of the dataset on which the backbone has been trained on
+ rpn_anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+ maps.
+ rpn_head (nn.Module): module that computes the objectness and regression deltas from the RPN
+ rpn_pre_nms_top_n_train (int): number of proposals to keep before applying NMS during training
+ rpn_pre_nms_top_n_test (int): number of proposals to keep before applying NMS during testing
+ rpn_post_nms_top_n_train (int): number of proposals to keep after applying NMS during training
+ rpn_post_nms_top_n_test (int): number of proposals to keep after applying NMS during testing
+ rpn_nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
+ rpn_fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+ considered as positive during training of the RPN.
+ rpn_bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
+ considered as negative during training of the RPN.
+ rpn_batch_size_per_image (int): number of anchors that are sampled during training of the RPN
+ for computing the loss
+ rpn_positive_fraction (float): proportion of positive anchors in a mini-batch during training
+ of the RPN
+ rpn_score_thresh (float): during inference, only return proposals with a classification score
+ greater than rpn_score_thresh
+ box_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
+ the locations indicated by the bounding boxes
+ box_head (nn.Module): module that takes the cropped feature maps as input
+ box_predictor (nn.Module): module that takes the output of box_head and returns the
+ classification logits and box regression deltas.
+ box_score_thresh (float): during inference, only return proposals with a classification score
+ greater than box_score_thresh
+ box_nms_thresh (float): NMS threshold for the prediction head. Used during inference
+ box_detections_per_img (int): maximum number of detections per image, for all classes.
+ box_fg_iou_thresh (float): minimum IoU between the proposals and the GT box so that they can be
+ considered as positive during training of the classification head
+ box_bg_iou_thresh (float): maximum IoU between the proposals and the GT box so that they can be
+ considered as negative during training of the classification head
+ box_batch_size_per_image (int): number of proposals that are sampled during training of the
+ classification head
+ box_positive_fraction (float): proportion of positive proposals in a mini-batch during training
+ of the classification head
+ bbox_reg_weights (Tuple[float, float, float, float]): weights for the encoding/decoding of the
+ bounding boxes
+
+ """
+
+ def __init__(self, backbone, num_classes=None,
+ # transform parameters
+ min_size=800, max_size=1333, # min/max size the images are resized to during preprocessing
+ image_mean=None, image_std=None, # mean and std used to normalize the images during preprocessing
+ # RPN parameters
+ rpn_anchor_generator=None, rpn_head=None,
+ rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, # number of proposals kept before NMS in the RPN (ranked by score)
+ rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, # number of proposals kept after NMS in the RPN
+ rpn_nms_thresh=0.7, # IoU threshold used for NMS inside the RPN
+ rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3, # IoU thresholds for sampling positive/negative anchors for the RPN loss
+ rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # number of anchors sampled for the RPN loss and the fraction of positives among them
+ rpn_score_thresh=0.0,
+ # Box parameters
+ box_roi_pool=None, box_head=None, box_predictor=None,
+ # score threshold to drop low-scoring detections / NMS threshold of the Fast R-CNN head / keep the top 100 detections ranked by score
+ box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
+ box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5, # IoU thresholds for sampling positive/negative proposals for the Fast R-CNN loss
+ box_batch_size_per_image=512, box_positive_fraction=0.25, # number of proposals sampled for the Fast R-CNN loss and the fraction of positives among them
+ bbox_reg_weights=None):
+ if not hasattr(backbone, "out_channels"):
+ raise ValueError(
+ "backbone should contain an attribute out_channels "
+ "specifying the number of output channels (assumed to be the "
+ "same for all the levels)"
+ )
+
+ # assert isinstance(rpn_anchor_generator, (AnchorsGenerator, type(None)))
+ assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))
+
+ if num_classes is not None:
+ if box_predictor is not None:
+ raise ValueError("num_classes should be None when box_predictor "
+ "is specified")
+ else:
+ if box_predictor is None:
+ raise ValueError("num_classes should not be None when box_predictor "
+ "is not specified")
+
+ # number of channels of the backbone's feature maps
+ out_channels = backbone.out_channels
+
+ # if no anchor generator is provided, build the default one for a resnet50_fpn backbone
+ if rpn_anchor_generator is None:
+ anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
+ aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
+ rpn_anchor_generator = AnchorsGenerator(
+ anchor_sizes, aspect_ratios
+ )
+
+ # build the RPN head that predicts objectness and bbox regression via a sliding window
+ if rpn_head is None:
+ rpn_head = RPNHead(
+ out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
+ )
+
+ # defaults: rpn_pre_nms_top_n_train = 2000, rpn_pre_nms_top_n_test = 1000,
+ # defaults: rpn_post_nms_top_n_train = 2000, rpn_post_nms_top_n_test = 1000,
+ rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
+ rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)
+
+ # assemble the full RPN
+ rpn = RegionProposalNetwork(
+ rpn_anchor_generator, rpn_head,
+ rpn_fg_iou_thresh, rpn_bg_iou_thresh,
+ rpn_batch_size_per_image, rpn_positive_fraction,
+ rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh,
+ score_thresh=rpn_score_thresh)
+
+ # Multi-scale RoIAlign pooling
+ if box_roi_pool is None:
+ box_roi_pool = MultiScaleRoIAlign(
+ featmap_names=['0', '1', '2', '3'], # feature levels on which RoI pooling is performed
+ output_size=[7, 7],
+ sampling_ratio=2)
+
+ # the two fully connected layers applied to the flattened RoI-pooled features in Fast R-CNN
+ if box_head is None:
+ resolution = box_roi_pool.output_size[0] # defaults to 7
+ representation_size = 1024
+ box_head = TwoMLPHead(
+ out_channels * resolution ** 2,
+ representation_size
+ )
+
+ # prediction layers applied on top of box_head's output
+ if box_predictor is None:
+ representation_size = 1024
+ box_predictor = FastRCNNPredictor(
+ representation_size,
+ num_classes)
+
+ # combine roi pooling, box_head and box_predictor
+ roi_heads = RoIHeads(
+ # box
+ box_roi_pool, box_head, box_predictor,
+ box_fg_iou_thresh, box_bg_iou_thresh, # 0.5 0.5
+ box_batch_size_per_image, box_positive_fraction, # 512 0.25
+ bbox_reg_weights,
+ box_score_thresh, box_nms_thresh, box_detections_per_img) # 0.05 0.5 100
+
+ if image_mean is None:
+ image_mean = [0.485, 0.456, 0.406]
+ if image_std is None:
+ image_std = [0.229, 0.224, 0.225]
+
+ # normalizes, resizes and batches the input images
+ transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
+
+ super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
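For orientation, a minimal construction and inference sketch (illustrative only; it assumes torchvision's resnet_fpn_backbone helper, whose FPN backbone exposes the out_channels attribute checked above, and the older pretrained= keyword of that helper):

    import torch
    from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

    backbone = resnet_fpn_backbone('resnet50', pretrained=False)  # exposes backbone.out_channels
    model = FasterRCNN(backbone=backbone, num_classes=91)         # 90 classes + background, hypothetical

    model.eval()
    images = [torch.rand(3, 480, 640), torch.rand(3, 600, 800)]   # 0-1 range, varying sizes
    with torch.no_grad():
        predictions = model(images)                               # List[Dict] with 'boxes', 'labels', 'scores'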
diff --git a/pytorch_object_detection/mask_rcnn/network_files/image_list.py b/pytorch_object_detection/mask_rcnn/network_files/image_list.py
new file mode 100644
index 000000000..a1b36f334
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/image_list.py
@@ -0,0 +1,27 @@
+from typing import List, Tuple
+from torch import Tensor
+
+
+class ImageList(object):
+ """
+ Structure that holds a list of images (of possibly
+ varying sizes) as a single tensor.
+ This works by padding the images to the same size,
+ and storing in a field the original sizes of each image
+ """
+
+ def __init__(self, tensors, image_sizes):
+ # type: (Tensor, List[Tuple[int, int]]) -> None
+ """
+ Arguments:
+ tensors (tensor): the batched image data after padding
+ image_sizes (list[tuple[int, int]]): the image sizes before padding
+ """
+ self.tensors = tensors
+ self.image_sizes = image_sizes
+
+ def to(self, device):
+ # type: (Device) -> ImageList # noqa
+ cast_tensor = self.tensors.to(device)
+ return ImageList(cast_tensor, self.image_sizes)
+
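A tiny illustration of the two fields this structure carries (hypothetical tensors, just to show the shapes):

    import torch

    batched = torch.zeros(2, 3, 600, 800)            # two images padded to a common canvas
    original_sizes = [(480, 640), (600, 800)]        # (H, W) of each image before padding
    image_list = ImageList(batched, original_sizes)
    print(image_list.tensors.shape)                  # torch.Size([2, 3, 600, 800])
    print(image_list.image_sizes)                    # [(480, 640), (600, 800)]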
diff --git a/pytorch_object_detection/mask_rcnn/network_files/mask_rcnn.py b/pytorch_object_detection/mask_rcnn/network_files/mask_rcnn.py
new file mode 100644
index 000000000..97a8d7fe9
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/mask_rcnn.py
@@ -0,0 +1,239 @@
+from collections import OrderedDict
+import torch.nn as nn
+from torchvision.ops import MultiScaleRoIAlign
+
+from .faster_rcnn_framework import FasterRCNN
+
+
+class MaskRCNN(FasterRCNN):
+ """
+ Implements Mask R-CNN.
+
+ The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+ image, and should be in 0-1 range. Different images can have different sizes.
+
+ The behavior of the model changes depending on whether it is in training or evaluation mode.
+
+ During training, the model expects both the input tensors and the targets (a list of dictionaries),
+ containing:
+ - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+ ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+ - labels (Int64Tensor[N]): the class label for each ground-truth box
+ - masks (UInt8Tensor[N, H, W]): the segmentation binary masks for each instance
+
+ The model returns a Dict[Tensor] during training, containing the classification and regression
+ losses for both the RPN and the R-CNN, and the mask loss.
+
+ During inference, the model requires only the input tensors, and returns the post-processed
+ predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+ follows:
+ - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+ ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+ - labels (Int64Tensor[N]): the predicted labels for each image
+ - scores (Tensor[N]): the scores of each prediction
+ - masks (UInt8Tensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to
+ obtain the final segmentation masks, the soft masks can be thresholded, generally
+ with a value of 0.5 (mask >= 0.5)
+
+ Args:
+ backbone (nn.Module): the network used to compute the features for the model.
+ It should contain an out_channels attribute, which indicates the number of output
+ channels that each feature map has (and it should be the same for all feature maps).
+ The backbone should return a single Tensor or an OrderedDict[Tensor].
+ num_classes (int): number of output classes of the model (including the background).
+ If box_predictor is specified, num_classes should be None.
+ min_size (int): minimum size of the image to be rescaled before feeding it to the backbone
+ max_size (int): maximum size of the image to be rescaled before feeding it to the backbone
+ image_mean (Tuple[float, float, float]): mean values used for input normalization.
+ They are generally the mean values of the dataset on which the backbone has been
+ trained
+ image_std (Tuple[float, float, float]): std values used for input normalization.
+ They are generally the std values of the dataset on which the backbone has been trained
+ rpn_anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+ maps.
+ rpn_head (nn.Module): module that computes the objectness and regression deltas from the RPN
+ rpn_pre_nms_top_n_train (int): number of proposals to keep before applying NMS during training
+ rpn_pre_nms_top_n_test (int): number of proposals to keep before applying NMS during testing
+ rpn_post_nms_top_n_train (int): number of proposals to keep after applying NMS during training
+ rpn_post_nms_top_n_test (int): number of proposals to keep after applying NMS during testing
+ rpn_nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
+ rpn_fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+ considered as positive during training of the RPN.
+ rpn_bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
+ considered as negative during training of the RPN.
+ rpn_batch_size_per_image (int): number of anchors that are sampled during training of the RPN
+ for computing the loss
+ rpn_positive_fraction (float): proportion of positive anchors in a mini-batch during training
+ of the RPN
+ rpn_score_thresh (float): during inference, only return proposals with a classification score
+ greater than rpn_score_thresh
+ box_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
+ the locations indicated by the bounding boxes
+ box_head (nn.Module): module that takes the cropped feature maps as input
+ box_predictor (nn.Module): module that takes the output of box_head and returns the
+ classification logits and box regression deltas.
+ box_score_thresh (float): during inference, only return proposals with a classification score
+ greater than box_score_thresh
+ box_nms_thresh (float): NMS threshold for the prediction head. Used during inference
+ box_detections_per_img (int): maximum number of detections per image, for all classes.
+ box_fg_iou_thresh (float): minimum IoU between the proposals and the GT box so that they can be
+ considered as positive during training of the classification head
+ box_bg_iou_thresh (float): maximum IoU between the proposals and the GT box so that they can be
+ considered as negative during training of the classification head
+ box_batch_size_per_image (int): number of proposals that are sampled during training of the
+ classification head
+ box_positive_fraction (float): proportion of positive proposals in a mini-batch during training
+ of the classification head
+ bbox_reg_weights (Tuple[float, float, float, float]): weights for the encoding/decoding of the
+ bounding boxes
+ mask_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
+ the locations indicated by the bounding boxes, which will be used for the mask head.
+ mask_head (nn.Module): module that takes the cropped feature maps as input
+ mask_predictor (nn.Module): module that takes the output of the mask_head and returns the
+ segmentation mask logits
+
+ """
+
+ def __init__(
+ self,
+ backbone,
+ num_classes=None,
+ # transform parameters
+ min_size=800,
+ max_size=1333,
+ image_mean=None,
+ image_std=None,
+ # RPN parameters
+ rpn_anchor_generator=None,
+ rpn_head=None,
+ rpn_pre_nms_top_n_train=2000,
+ rpn_pre_nms_top_n_test=1000,
+ rpn_post_nms_top_n_train=2000,
+ rpn_post_nms_top_n_test=1000,
+ rpn_nms_thresh=0.7,
+ rpn_fg_iou_thresh=0.7,
+ rpn_bg_iou_thresh=0.3,
+ rpn_batch_size_per_image=256,
+ rpn_positive_fraction=0.5,
+ rpn_score_thresh=0.0,
+ # Box parameters
+ box_roi_pool=None,
+ box_head=None,
+ box_predictor=None,
+ box_score_thresh=0.05,
+ box_nms_thresh=0.5,
+ box_detections_per_img=100,
+ box_fg_iou_thresh=0.5,
+ box_bg_iou_thresh=0.5,
+ box_batch_size_per_image=512,
+ box_positive_fraction=0.25,
+ bbox_reg_weights=None,
+ # Mask parameters
+ mask_roi_pool=None,
+ mask_head=None,
+ mask_predictor=None,
+ ):
+
+ if not isinstance(mask_roi_pool, (MultiScaleRoIAlign, type(None))):
+ raise TypeError(
+ f"mask_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(mask_roi_pool)}"
+ )
+
+ if num_classes is not None:
+ if mask_predictor is not None:
+ raise ValueError("num_classes should be None when mask_predictor is specified")
+
+ out_channels = backbone.out_channels
+
+ if mask_roi_pool is None:
+ mask_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=14, sampling_ratio=2)
+
+ if mask_head is None:
+ mask_layers = (256, 256, 256, 256)
+ mask_dilation = 1
+ mask_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)
+
+ if mask_predictor is None:
+ mask_predictor_in_channels = 256
+ mask_dim_reduced = 256
+ mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels, mask_dim_reduced, num_classes)
+
+ super().__init__(
+ backbone,
+ num_classes,
+ # transform parameters
+ min_size,
+ max_size,
+ image_mean,
+ image_std,
+ # RPN-specific parameters
+ rpn_anchor_generator,
+ rpn_head,
+ rpn_pre_nms_top_n_train,
+ rpn_pre_nms_top_n_test,
+ rpn_post_nms_top_n_train,
+ rpn_post_nms_top_n_test,
+ rpn_nms_thresh,
+ rpn_fg_iou_thresh,
+ rpn_bg_iou_thresh,
+ rpn_batch_size_per_image,
+ rpn_positive_fraction,
+ rpn_score_thresh,
+ # Box parameters
+ box_roi_pool,
+ box_head,
+ box_predictor,
+ box_score_thresh,
+ box_nms_thresh,
+ box_detections_per_img,
+ box_fg_iou_thresh,
+ box_bg_iou_thresh,
+ box_batch_size_per_image,
+ box_positive_fraction,
+ bbox_reg_weights,
+ )
+
+ self.roi_heads.mask_roi_pool = mask_roi_pool
+ self.roi_heads.mask_head = mask_head
+ self.roi_heads.mask_predictor = mask_predictor
+
+
+class MaskRCNNHeads(nn.Sequential):
+ def __init__(self, in_channels, layers, dilation):
+ """
+ Args:
+ in_channels (int): number of input channels
+ layers (tuple): feature dimensions of each FCN layer
+ dilation (int): dilation rate of kernel
+ """
+ d = OrderedDict()
+ next_feature = in_channels
+
+ for layer_idx, layers_features in enumerate(layers, 1):
+ d[f"mask_fcn{layer_idx}"] = nn.Conv2d(next_feature,
+ layers_features,
+ kernel_size=3,
+ stride=1,
+ padding=dilation,
+ dilation=dilation)
+ d[f"relu{layer_idx}"] = nn.ReLU(inplace=True)
+ next_feature = layers_features
+
+ super().__init__(d)
+ # initial params
+ for name, param in self.named_parameters():
+ if "weight" in name:
+ nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
+
+
+class MaskRCNNPredictor(nn.Sequential):
+ def __init__(self, in_channels, dim_reduced, num_classes):
+ super().__init__(OrderedDict([
+ ("conv5_mask", nn.ConvTranspose2d(in_channels, dim_reduced, 2, 2, 0)),
+ ("relu", nn.ReLU(inplace=True)),
+ ("mask_fcn_logits", nn.Conv2d(dim_reduced, num_classes, 1, 1, 0))
+ ]))
+ # initial params
+ for name, param in self.named_parameters():
+ if "weight" in name:
+ nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
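A common way to adapt these heads to a custom dataset is to swap the predictors for a new number of classes. A hedged sketch (illustrative only; it reuses torchvision's resnet_fpn_backbone and the 1024/256 representation sizes used by the default box and mask heads in these files):

    from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

    num_classes = 3  # hypothetical: 2 foreground classes + background
    backbone = resnet_fpn_backbone('resnet50', pretrained=False)
    model = MaskRCNN(backbone, num_classes=num_classes)

    # equivalent after-the-fact replacement on an existing model:
    # model.roi_heads.box_predictor = FastRCNNPredictor(1024, num_classes)
    # model.roi_heads.mask_predictor = MaskRCNNPredictor(256, 256, num_classes)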
diff --git a/pytorch_object_detection/mask_rcnn/network_files/roi_head.py b/pytorch_object_detection/mask_rcnn/network_files/roi_head.py
new file mode 100644
index 000000000..7269f58da
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/roi_head.py
@@ -0,0 +1,560 @@
+from typing import Optional, List, Dict, Tuple
+
+import torch
+from torch import Tensor
+import torch.nn.functional as F
+from torchvision.ops import roi_align
+
+from . import det_utils
+from . import boxes as box_ops
+
+
+def fastrcnn_loss(class_logits, box_regression, labels, regression_targets):
+ # type: (Tensor, Tensor, List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
+ """
+ Computes the loss for Faster R-CNN.
+
+ Arguments:
+ class_logits : predicted class scores, shape=[num_anchors, num_classes]
+ box_regression : predicted bounding box regression parameters
+ labels : ground-truth class labels
+ regression_targets : ground-truth bounding box regression targets
+
+ Returns:
+ classification_loss (Tensor)
+ box_loss (Tensor)
+ """
+
+ labels = torch.cat(labels, dim=0)
+ regression_targets = torch.cat(regression_targets, dim=0)
+
+ # classification loss
+ classification_loss = F.cross_entropy(class_logits, labels)
+
+ # get indices that correspond to the regression targets for
+ # the corresponding ground truth labels, to be used with
+ # advanced indexing
+ # indices of the samples whose label is greater than 0 (foreground)
+ # sampled_pos_inds_subset = torch.nonzero(torch.gt(labels, 0)).squeeze(1)
+ sampled_pos_inds_subset = torch.where(torch.gt(labels, 0))[0]
+
+ # class labels of those foreground samples
+ labels_pos = labels[sampled_pos_inds_subset]
+
+ # shape=[num_proposal, num_classes]
+ N, num_classes = class_logits.shape
+ box_regression = box_regression.reshape(N, -1, 4)
+
+ # bounding box regression loss
+ box_loss = det_utils.smooth_l1_loss(
+ # select, for each foreground proposal, the box prediction of its ground-truth class
+ box_regression[sampled_pos_inds_subset, labels_pos],
+ regression_targets[sampled_pos_inds_subset],
+ beta=1 / 9,
+ size_average=False,
+ ) / labels.numel()
+
+ return classification_loss, box_loss
+
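The advanced indexing above keeps, for every positive proposal, only the four regression values of its ground-truth class; a small self-contained illustration (random numbers, not the repository's tensors):

    import torch

    N, num_classes = 5, 4
    box_regression = torch.randn(N, num_classes * 4).reshape(N, -1, 4)
    labels = torch.tensor([0, 2, 1, 0, 3])        # 0 = background
    pos = torch.where(labels > 0)[0]              # tensor([1, 2, 4])
    selected = box_regression[pos, labels[pos]]   # one [4]-vector per positive proposal
    print(selected.shape)                         # torch.Size([3, 4])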
+
+def maskrcnn_inference(x, labels):
+ # type: (Tensor, List[Tensor]) -> List[Tensor]
+ """
+ From the results of the CNN, post process the masks
+ by taking the mask corresponding to the class with max
+ probability (which are of fixed size and directly output
+ by the CNN) and return the masks in the mask field of the BoxList.
+
+ Args:
+ x (Tensor): the mask logits
+ labels (list[BoxList]): bounding boxes that are used as
+ reference, one for each image
+
+ Returns:
+ results (list[BoxList]): one BoxList for each image, containing
+ the extra field mask
+ """
+ # squash the logits into the 0~1 range with a sigmoid
+ mask_prob = x.sigmoid()
+
+ # select masks corresponding to the predicted classes
+ num_masks = x.shape[0]
+ # first record the number of boxes/masks per image
+ boxes_per_image = [label.shape[0] for label in labels]
+ # then concatenate the masks of all images (processing them together improves parallelism)
+ labels = torch.cat(labels)
+ index = torch.arange(num_masks, device=labels.device)
+ # keep, for each detection, the mask channel of its predicted class
+ mask_prob = mask_prob[index, labels][:, None]
+ # finally split the masks back per image according to the recorded counts
+ mask_prob = mask_prob.split(boxes_per_image, dim=0)
+
+ return mask_prob
+
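The same per-class gather is used for the masks; a toy check of the indexing and the final per-image split (random data, shapes only):

    import torch

    mask_prob = torch.rand(6, 4, 28, 28)              # 6 detections, 4 classes, 28x28 masks
    labels = torch.tensor([1, 3, 2, 1, 1, 2])         # predicted class of each detection
    index = torch.arange(6)
    per_class = mask_prob[index, labels][:, None]     # [6, 1, 28, 28]
    per_image = per_class.split([2, 4], dim=0)        # e.g. 2 detections in image 0, 4 in image 1
    print([m.shape for m in per_image])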
+
+def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
+ # type: (Tensor, Tensor, Tensor, int) -> Tensor
+ """
+ Given segmentation masks and the bounding boxes corresponding
+ to the location of the masks in the image, this function
+ crops and resizes the masks in the position defined by the
+ boxes. This prepares the masks for them to be fed to the
+ loss computation as the targets.
+ """
+ matched_idxs = matched_idxs.to(boxes)
+ rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
+ gt_masks = gt_masks[:, None].to(rois)
+ return roi_align(gt_masks, rois, (M, M), 1.0)[:, 0]
+
+
+def maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs):
+ # type: (Tensor, List[Tensor], List[Tensor], List[Tensor], List[Tensor]) -> Tensor
+ """
+
+ Args:
+ mask_logits:
+ proposals:
+ gt_masks:
+ gt_labels:
+ mask_matched_idxs:
+
+ Returns:
+ mask_loss (Tensor): scalar tensor containing the loss
+ """
+
+ # 28 (spatial size of the masks output by the FCN branch)
+ discretization_size = mask_logits.shape[-1]
+ # ground-truth class of each proposal (all proposals here are positive samples)
+ labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
+ # crop the region of gt_masks covered by each proposal to build the actual mask targets used in the loss
+ mask_targets = [
+ project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
+ ]
+
+ # concatenate the information of all proposals in the batch (processing them together improves parallelism)
+ labels = torch.cat(labels, dim=0)
+ mask_targets = torch.cat(mask_targets, dim=0)
+
+ # torch.mean (in binary_cross_entropy_with_logits) doesn't
+ # accept empty tensors, so handle it separately
+ if mask_targets.numel() == 0:
+ return mask_logits.sum() * 0
+
+ # binary cross-entropy loss between the predicted masks and the ground-truth masks
+ mask_loss = F.binary_cross_entropy_with_logits(
+ mask_logits[torch.arange(labels.shape[0], device=labels.device), labels], mask_targets
+ )
+ return mask_loss
+
+
+class RoIHeads(torch.nn.Module):
+ __annotations__ = {
+ 'box_coder': det_utils.BoxCoder,
+ 'proposal_matcher': det_utils.Matcher,
+ 'fg_bg_sampler': det_utils.BalancedPositiveNegativeSampler,
+ }
+
+ def __init__(self,
+ box_roi_pool, # Multi-scale RoIAlign pooling
+ box_head, # TwoMLPHead
+ box_predictor, # FastRCNNPredictor
+ # Faster R-CNN training
+ fg_iou_thresh, bg_iou_thresh, # default: 0.5, 0.5
+ batch_size_per_image, positive_fraction, # default: 512, 0.25
+ bbox_reg_weights, # None
+ # Faster R-CNN inference
+ score_thresh, # default: 0.05
+ nms_thresh, # default: 0.5
+ detection_per_img, # default: 100
+ # Mask
+ mask_roi_pool=None,
+ mask_head=None,
+ mask_predictor=None,
+ ):
+ super(RoIHeads, self).__init__()
+
+ self.box_similarity = box_ops.box_iou
+ # assign ground-truth boxes for each proposal
+ self.proposal_matcher = det_utils.Matcher(
+ fg_iou_thresh, # default: 0.5
+ bg_iou_thresh, # default: 0.5
+ allow_low_quality_matches=False)
+
+ self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
+ batch_size_per_image, # default: 512
+ positive_fraction) # default: 0.25
+
+ if bbox_reg_weights is None:
+ bbox_reg_weights = (10., 10., 5., 5.)
+ self.box_coder = det_utils.BoxCoder(bbox_reg_weights)
+
+ self.box_roi_pool = box_roi_pool # Multi-scale RoIAlign pooling
+ self.box_head = box_head # TwoMLPHead
+ self.box_predictor = box_predictor # FastRCNNPredictor
+
+ self.score_thresh = score_thresh # default: 0.05
+ self.nms_thresh = nms_thresh # default: 0.5
+ self.detection_per_img = detection_per_img # default: 100
+
+ self.mask_roi_pool = mask_roi_pool
+ self.mask_head = mask_head
+ self.mask_predictor = mask_predictor
+
+ def has_mask(self):
+ if self.mask_roi_pool is None:
+ return False
+ if self.mask_head is None:
+ return False
+ if self.mask_predictor is None:
+ return False
+ return True
+
+ def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
+ # type: (List[Tensor], List[Tensor], List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
+ """
+ Match each proposal with a gt box and split the proposals into positive and negative samples
+ Args:
+ proposals:
+ gt_boxes:
+ gt_labels:
+
+ Returns:
+
+ """
+ matched_idxs = []
+ labels = []
+ # iterate over the proposals, gt_boxes and gt_labels of each image
+ for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):
+ if gt_boxes_in_image.numel() == 0: # this image has no gt boxes, everything is background
+ # background image
+ device = proposals_in_image.device
+ clamped_matched_idxs_in_image = torch.zeros(
+ (proposals_in_image.shape[0],), dtype=torch.int64, device=device
+ )
+ labels_in_image = torch.zeros(
+ (proposals_in_image.shape[0],), dtype=torch.int64, device=device
+ )
+ else:
+ # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
+ # IoU between every proposal and every gt box
+ match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)
+
+ # for each proposal, record the index of the gt box with the highest IoU;
+ # proposals with iou < low_threshold get index -1, those with low_threshold <= iou < high_threshold get index -2
+ matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)
+
+ # clamp to a minimum of 0 to avoid out-of-bounds indexing when gathering labels
+ # note: gt indices -1 and -2 are clamped to 0, so these proposals temporarily pick up the label of gt 0 (not their real label); this is corrected below
+ clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)
+ # labels of the gt boxes matched to the proposals
+ labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
+ labels_in_image = labels_in_image.to(dtype=torch.int64)
+
+ # label background (below the low threshold)
+ # set the label of proposals with gt index -1 to 0, i.e. background / negative samples
+ bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD # -1
+ labels_in_image[bg_inds] = 0
+
+ # label ignore proposals (between low and high threshold)
+ # set the label of proposals with gt index -2 to -1, i.e. discarded samples
+ ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS # -2
+ labels_in_image[ignore_inds] = -1 # -1 is ignored by sampler
+
+ matched_idxs.append(clamped_matched_idxs_in_image)
+ labels.append(labels_in_image)
+ return matched_idxs, labels
+
+ def subsample(self, labels):
+ # type: (List[Tensor]) -> List[Tensor]
+ # BalancedPositiveNegativeSampler
+ sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
+ sampled_inds = []
+ # iterate over the positive/negative sample indices of each image
+ for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
+ # record the indices of all sampled proposals (both positives and negatives)
+ # img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
+ img_sampled_inds = torch.where(pos_inds_img | neg_inds_img)[0]
+ sampled_inds.append(img_sampled_inds)
+ return sampled_inds
+
+ def add_gt_proposals(self, proposals, gt_boxes):
+ # type: (List[Tensor], List[Tensor]) -> List[Tensor]
+ """
+ Append the gt boxes to the proposals
+ Args:
+ proposals: boxes predicted by the RPN for each image in the batch
+ gt_boxes: ground-truth boxes for each image in the batch
+
+ Returns:
+
+ """
+ proposals = [
+ torch.cat((proposal, gt_box))
+ for proposal, gt_box in zip(proposals, gt_boxes)
+ ]
+ return proposals
+
+ def check_targets(self, targets):
+ # type: (Optional[List[Dict[str, Tensor]]]) -> None
+ assert targets is not None
+ assert all(["boxes" in t for t in targets])
+ assert all(["labels" in t for t in targets])
+
+ def select_training_samples(self,
+ proposals, # type: List[Tensor]
+ targets # type: Optional[List[Dict[str, Tensor]]]
+ ):
+ # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]
+ """
+ Split the proposals into positive/negative samples and gather the matched gt labels and box regression targets.
+ Each returned list has batch_size elements.
+ Args:
+ proposals: boxes predicted by the RPN
+ targets:
+
+ Returns:
+
+ """
+
+ # check that the targets are not empty
+ self.check_targets(targets)
+ if targets is None:
+ raise ValueError("target should not be None.")
+
+ dtype = proposals[0].dtype
+ device = proposals[0].device
+
+ # gather the annotated boxes and labels
+ gt_boxes = [t["boxes"].to(dtype) for t in targets]
+ gt_labels = [t["labels"] for t in targets]
+
+ # append ground-truth bboxes to proposal
+ # append the gt boxes to the proposals
+ proposals = self.add_gt_proposals(proposals, gt_boxes)
+
+ # get matching gt indices for each proposal
+ # match every proposal with a gt box and split into positive/negative samples
+ matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
+ # sample a fixed proportion of positive-negative proposals
+ # sample positives/negatives with the configured batch size and positive fraction
+ sampled_inds = self.subsample(labels)
+ matched_gt_boxes = []
+ num_images = len(proposals)
+
+ # iterate over the images
+ for img_id in range(num_images):
+ # sampled indices (positives and negatives) of this image
+ img_sampled_inds = sampled_inds[img_id]
+ # keep only the sampled proposals
+ proposals[img_id] = proposals[img_id][img_sampled_inds]
+ # their ground-truth class labels
+ labels[img_id] = labels[img_id][img_sampled_inds]
+ # and the indices of their matched gt boxes
+ matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]
+
+ gt_boxes_in_image = gt_boxes[img_id]
+ if gt_boxes_in_image.numel() == 0:
+ gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
+ # gt box coordinates matched to the sampled proposals
+ matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])
+
+ # compute the box regression targets of the proposals with respect to their matched gt boxes
+ regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
+ return proposals, matched_idxs, labels, regression_targets
+
+ def postprocess_detections(self,
+ class_logits, # type: Tensor
+ box_regression, # type: Tensor
+ proposals, # type: List[Tensor]
+ image_shapes # type: List[Tuple[int, int]]
+ ):
+ # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]
+ """
+ Post-process the network predictions:
+ (1) compute the final bbox coordinates from the proposals and the predicted regression parameters
+ (2) apply softmax to the predicted class scores
+ (3) clip the predicted boxes to the image boundaries
+ (4) remove all background predictions
+ (5) remove low-scoring predictions
+ (6) remove small boxes
+ (7) run NMS and sort the results by score
+ (8) keep only the top-k highest scoring predictions
+ Args:
+ class_logits: predicted class scores
+ box_regression: predicted bounding box regression parameters
+ proposals: proposals output by the RPN
+ image_shapes: width and height of each image before batching
+
+ Returns:
+
+ """
+ device = class_logits.device
+ # number of predicted classes
+ num_classes = class_logits.shape[-1]
+
+ # number of predicted boxes per image
+ boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
+ # decode the final bbox coordinates from the proposals and the predicted regression parameters
+ pred_boxes = self.box_coder.decode(box_regression, proposals)
+
+ # apply softmax to the predicted class scores
+ pred_scores = F.softmax(class_logits, -1)
+
+ # split boxes and scores per image
+ # split the results back per image
+ pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
+ pred_scores_list = pred_scores.split(boxes_per_image, 0)
+
+ all_boxes = []
+ all_scores = []
+ all_labels = []
+ # iterate over the predictions of each image
+ for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
+ # clip out-of-image box coordinates to the image boundaries
+ boxes = box_ops.clip_boxes_to_image(boxes, image_shape)
+
+ # create labels for each prediction
+ labels = torch.arange(num_classes, device=device)
+ labels = labels.view(1, -1).expand_as(scores)
+
+ # remove prediction with the background label
+ # drop column 0, which corresponds to the background class
+ boxes = boxes[:, 1:]
+ scores = scores[:, 1:]
+ labels = labels[:, 1:]
+
+ # batch everything, by making every class prediction be a separate instance
+ boxes = boxes.reshape(-1, 4)
+ scores = scores.reshape(-1)
+ labels = labels.reshape(-1)
+
+ # remove low scoring boxes
+ # remove low-scoring predictions, self.score_thresh=0.05
+ # gt: Computes input > other element-wise.
+ # inds = torch.nonzero(torch.gt(scores, self.score_thresh)).squeeze(1)
+ inds = torch.where(torch.gt(scores, self.score_thresh))[0]
+ boxes, scores, labels = boxes[inds], scores[inds], labels[inds]
+
+ # remove empty boxes
+ # remove small boxes
+ keep = box_ops.remove_small_boxes(boxes, min_size=1.)
+ boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
+
+ # non-maximum suppression, independently done per class
+ # run NMS; the surviving indices are returned sorted by score in descending order
+ keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
+
+ # keep only topk scoring predictions
+ # keep the top-k predictions ranked by score
+ keep = keep[:self.detection_per_img]
+ boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
+
+ all_boxes.append(boxes)
+ all_scores.append(scores)
+ all_labels.append(labels)
+
+ return all_boxes, all_scores, all_labels
+
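The box_ops helpers used above mirror functions that also exist publicly in torchvision.ops; a hedged single-image sketch of steps (3) and (6)-(8) with those public functions (hypothetical boxes and thresholds):

    import torch
    from torchvision.ops import clip_boxes_to_image, remove_small_boxes, batched_nms

    boxes = torch.tensor([[-5., 10., 120., 200.], [30., 30., 90., 95.], [40., 50., 90., 160.]])
    scores = torch.tensor([0.90, 0.80, 0.75])
    labels = torch.tensor([1, 1, 2])

    boxes = clip_boxes_to_image(boxes, (180, 150))   # (3) clip to an H=180, W=150 image
    keep = remove_small_boxes(boxes, min_size=1.)    # (6) drop degenerate boxes
    boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
    keep = batched_nms(boxes, scores, labels, 0.5)   # (7) per-class NMS, sorted by score
    keep = keep[:100]                                # (8) keep the top-k
    boxes, scores, labels = boxes[keep], scores[keep], labels[keep]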
+ def forward(self,
+ features, # type: Dict[str, Tensor]
+ proposals, # type: List[Tensor]
+ image_shapes, # type: List[Tuple[int, int]]
+ targets=None # type: Optional[List[Dict[str, Tensor]]]
+ ):
+ # type: (...) -> Tuple[List[Dict[str, Tensor]], Dict[str, Tensor]]
+ """
+ Arguments:
+ features (List[Tensor])
+ proposals (List[Tensor[N, 4]])
+ image_shapes (List[Tuple[H, W]])
+ targets (List[Dict])
+ """
+
+ # check that the targets have the expected dtypes
+ if targets is not None:
+ for t in targets:
+ floating_point_types = (torch.float, torch.double, torch.half)
+ assert t["boxes"].dtype in floating_point_types, "target boxes must be of float type"
+ assert t["labels"].dtype == torch.int64, "target labels must be of int64 type"
+
+ if self.training:
+ # split the proposals into positive/negative samples and gather the matched gt labels and regression targets
+ proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
+ else:
+ labels = None
+ regression_targets = None
+ matched_idxs = None
+
+ # pass the sampled proposals through the multi-scale RoIAlign pooling layer
+ # box_features_shape: [num_proposals, channel, height, width]
+ box_features = self.box_roi_pool(features, proposals, image_shapes)
+
+ # two fully connected layers applied after roi_pooling
+ # box_features_shape: [num_proposals, representation_size]
+ box_features = self.box_head(box_features)
+
+ # then predict the class scores and the bbox regression parameters
+ class_logits, box_regression = self.box_predictor(box_features)
+
+ result: List[Dict[str, torch.Tensor]] = []
+ losses = {}
+ if self.training:
+ assert labels is not None and regression_targets is not None
+ loss_classifier, loss_box_reg = fastrcnn_loss(
+ class_logits, box_regression, labels, regression_targets)
+ losses = {
+ "loss_classifier": loss_classifier,
+ "loss_box_reg": loss_box_reg
+ }
+ else:
+ boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
+ num_images = len(boxes)
+ for i in range(num_images):
+ result.append(
+ {
+ "boxes": boxes[i],
+ "labels": labels[i],
+ "scores": scores[i],
+ }
+ )
+
+ if self.has_mask():
+ mask_proposals = [p["boxes"] for p in result] # take the final predicted boxes
+ if self.training:
+ # matched_idxs holds the gt index matched to each proposal during sampling (background proposals default to gt index 0)
+ if matched_idxs is None:
+ raise ValueError("if in training, matched_idxs should not be None")
+
+ # during training, only focus on positive boxes
+ num_images = len(proposals)
+ mask_proposals = []
+ pos_matched_idxs = []
+ for img_id in range(num_images):
+ pos = torch.where(labels[img_id] > 0)[0] # proposals whose gt class is > 0, i.e. positive samples
+ mask_proposals.append(proposals[img_id][pos])
+ pos_matched_idxs.append(matched_idxs[img_id][pos])
+ else:
+ pos_matched_idxs = None
+
+ mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
+ mask_features = self.mask_head(mask_features)
+ mask_logits = self.mask_predictor(mask_features)
+
+ loss_mask = {}
+ if self.training:
+ if targets is None or pos_matched_idxs is None or mask_logits is None:
+ raise ValueError("targets, pos_matched_idxs, mask_logits cannot be None when training")
+
+ gt_masks = [t["masks"] for t in targets]
+ gt_labels = [t["labels"] for t in targets]
+ rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
+ loss_mask = {"loss_mask": rcnn_loss_mask}
+ else:
+ labels = [r["labels"] for r in result]
+ mask_probs = maskrcnn_inference(mask_logits, labels)
+ for mask_prob, r in zip(mask_probs, result):
+ r["masks"] = mask_prob
+
+ losses.update(loss_mask)
+
+ return result, losses
diff --git a/pytorch_object_detection/mask_rcnn/network_files/rpn_function.py b/pytorch_object_detection/mask_rcnn/network_files/rpn_function.py
new file mode 100644
index 000000000..b18689884
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/rpn_function.py
@@ -0,0 +1,643 @@
+from typing import List, Optional, Dict, Tuple
+
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+import torchvision
+
+from . import det_utils
+from . import boxes as box_ops
+from .image_list import ImageList
+
+
+@torch.jit.unused
+def _onnx_get_num_anchors_and_pre_nms_top_n(ob, orig_pre_nms_top_n):
+ # type: (Tensor, int) -> Tuple[int, int]
+ from torch.onnx import operators
+ num_anchors = operators.shape_as_tensor(ob)[1].unsqueeze(0)
+ pre_nms_top_n = torch.min(torch.cat(
+ (torch.tensor([orig_pre_nms_top_n], dtype=num_anchors.dtype),
+ num_anchors), 0))
+
+ return num_anchors, pre_nms_top_n
+
+
+class AnchorsGenerator(nn.Module):
+ __annotations__ = {
+ "cell_anchors": Optional[List[torch.Tensor]],
+ "_cache": Dict[str, List[torch.Tensor]]
+ }
+
+ """
+ Anchor generator.
+ Module that generates anchors for a set of feature maps and
+ image sizes.
+
+ The module support computing anchors at multiple sizes and aspect ratios
+ per feature map.
+
+ sizes and aspect_ratios should have the same number of elements, and it should
+ correspond to the number of feature maps.
+
+ sizes[i] and aspect_ratios[i] can have an arbitrary number of elements,
+ and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors
+ per spatial location for feature map i.
+
+ Arguments:
+ sizes (Tuple[Tuple[int]]):
+ aspect_ratios (Tuple[Tuple[float]]):
+ """
+
+ def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
+ super(AnchorsGenerator, self).__init__()
+
+ if not isinstance(sizes[0], (list, tuple)):
+ # TODO change this
+ sizes = tuple((s,) for s in sizes)
+ if not isinstance(aspect_ratios[0], (list, tuple)):
+ aspect_ratios = (aspect_ratios,) * len(sizes)
+
+ assert len(sizes) == len(aspect_ratios)
+
+ self.sizes = sizes
+ self.aspect_ratios = aspect_ratios
+ self.cell_anchors = None
+ self._cache = {}
+
+ def generate_anchors(self, scales, aspect_ratios, dtype=torch.float32, device=torch.device("cpu")):
+ # type: (List[int], List[float], torch.dtype, torch.device) -> Tensor
+ """
+ compute anchor sizes
+ Arguments:
+ scales: sqrt(anchor_area)
+ aspect_ratios: h/w ratios
+ dtype: float32
+ device: cpu/gpu
+ """
+ scales = torch.as_tensor(scales, dtype=dtype, device=device)
+ aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
+ h_ratios = torch.sqrt(aspect_ratios)
+ w_ratios = 1.0 / h_ratios
+
+ # [r1, r2, r3]' * [s1, s2, s3]
+ # number of elements is len(ratios)*len(scales)
+ ws = (w_ratios[:, None] * scales[None, :]).view(-1)
+ hs = (h_ratios[:, None] * scales[None, :]).view(-1)
+
+ # left-top, right-bottom coordinate relative to anchor center(0, 0)
+ # the anchor templates are all centered at (0, 0), shape [len(ratios)*len(scales), 4]
+ base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
+
+ return base_anchors.round() # round to the nearest integer
+
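A quick numeric check of the templates this method produces, e.g. for a single size of 32 with the default aspect ratios (values worked out by hand, rounding included):

    gen = AnchorsGenerator(sizes=((32,),), aspect_ratios=((0.5, 1.0, 2.0),))
    templates = gen.generate_anchors(scales=[32], aspect_ratios=[0.5, 1.0, 2.0])
    print(templates)
    # tensor([[-23., -11.,  23.,  11.],     # ratio 0.5: wide and short
    #         [-16., -16.,  16.,  16.],     # ratio 1.0: square
    #         [-11., -23.,  11.,  23.]])    # ratio 2.0: tall and narrow

All three templates cover roughly 32*32 pixels; only the aspect ratio changes.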
+ def set_cell_anchors(self, dtype, device):
+ # type: (torch.dtype, torch.device) -> None
+ if self.cell_anchors is not None:
+ cell_anchors = self.cell_anchors
+ assert cell_anchors is not None
+ # suppose that all anchors have the same device
+ # which is a valid assumption in the current state of the codebase
+ if cell_anchors[0].device == device:
+ return
+
+ # generate the anchor templates from the given sizes and aspect_ratios
+ # all templates are centered at (0, 0)
+ cell_anchors = [
+ self.generate_anchors(sizes, aspect_ratios, dtype, device)
+ for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios)
+ ]
+ self.cell_anchors = cell_anchors
+
+ def num_anchors_per_location(self):
+ # number of anchors predicted at each sliding-window position of every feature level
+ return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
+
+ # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
+ # output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
+ def grid_anchors(self, grid_sizes, strides):
+ # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
+ """
+ Map the anchor positions from the feature-map grid back onto the original image,
+ i.e. compute the coordinates of all anchors on the original image.
+ Args:
+ grid_sizes: height and width of the prediction feature maps
+ strides: stride on the original image corresponding to one step on the feature map
+ """
+ anchors = []
+ cell_anchors = self.cell_anchors
+ assert cell_anchors is not None
+
+ # iterate over the grid size, stride and anchor templates of every feature level
+ for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
+ grid_height, grid_width = size
+ stride_height, stride_width = stride
+ device = base_anchors.device
+
+ # For output anchor, compute [x_center, y_center, x_center, y_center]
+ # shape: [grid_width], x coordinates (columns) on the original image
+ shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width
+ # shape: [grid_height], y coordinates (rows) on the original image
+ shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height
+
+ # compute, for every point of the feature map, its coordinates on the original image
+ # (these are the offsets applied to the anchor templates); torch.meshgrid takes the row and
+ # column coordinates and returns the row/column coordinate grids, shape: [grid_height, grid_width]
+ shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
+ shift_x = shift_x.reshape(-1)
+ shift_y = shift_y.reshape(-1)
+
+ # offsets of the anchor coordinates (xmin, ymin, xmax, ymax) on the original image
+ # shape: [grid_width*grid_height, 4]
+ shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
+
+ # For every (base anchor, output anchor) pair,
+ # offset each zero-centered base anchor by the center of the output anchor.
+ # add the offsets to the anchor templates to get the coordinates of all anchors on the original image (broadcasting handles the different shapes)
+ shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
+ anchors.append(shifts_anchor.reshape(-1, 4))
+
+ return anchors # List[Tensor(all_num_anchors, 4)]
+
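The broadcasting in the last step pairs every grid offset with every template; a tiny illustration with a 2x2 grid, a stride of 16 and a single hypothetical template:

    import torch

    base_anchors = torch.tensor([[-16., -16., 16., 16.]])        # one template
    shifts_x = torch.arange(0, 2, dtype=torch.float32) * 16      # [0., 16.]
    shifts_y = torch.arange(0, 2, dtype=torch.float32) * 16
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
    sx, sy = shift_x.reshape(-1), shift_y.reshape(-1)
    shifts = torch.stack([sx, sy, sx, sy], dim=1)                # [4, 4] offsets
    anchors = (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)
    print(anchors.shape)                                         # torch.Size([4, 4]): 2*2 positions x 1 template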
+ def cached_grid_anchors(self, grid_sizes, strides):
+ # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
+ """Cache all computed anchors."""
+ key = str(grid_sizes) + str(strides)
+ # self._cache is a dict
+ if key in self._cache:
+ return self._cache[key]
+ anchors = self.grid_anchors(grid_sizes, strides)
+ self._cache[key] = anchors
+ return anchors
+
+ def forward(self, image_list, feature_maps):
+ # type: (ImageList, List[Tensor]) -> List[Tensor]
+ # (height, width) of every prediction feature map
+ grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
+
+ # height and width of the (padded) input images
+ image_size = image_list.tensors.shape[-2:]
+
+ # dtype and device of the feature maps
+ dtype, device = feature_maps[0].dtype, feature_maps[0].device
+
+ # one step in feature map equate n pixel stride in origin image
+ # one step on the feature map corresponds to this many pixels on the original image
+ strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
+ torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
+
+ # generate the anchor templates from the given sizes and aspect_ratios
+ self.set_cell_anchors(dtype, device)
+
+ # compute/read the coordinates of all anchors (anchors mapped onto the original image, not the templates)
+ # the result is a list with one entry per feature level, holding that level's anchors mapped back to the original image
+ anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides)
+
+ anchors = torch.jit.annotate(List[List[torch.Tensor]], [])
+ # iterate over the images in the batch
+ for i, (image_height, image_width) in enumerate(image_list.image_sizes):
+ anchors_in_image = []
+ # iterate over the anchors of every feature level
+ for anchors_per_feature_map in anchors_over_all_feature_maps:
+ anchors_in_image.append(anchors_per_feature_map)
+ anchors.append(anchors_in_image)
+ # concatenate the anchors of all feature levels for each image
+ # anchors is a list with one element per image holding all of its anchors
+ anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
+ # Clear the cache in case that memory leaks.
+ self._cache.clear()
+ return anchors
+
+
+class RPNHead(nn.Module):
+ """
+ add a RPN head with classification and regression
+ Computes objectness scores and bbox regression parameters via a sliding window.
+
+ Arguments:
+ in_channels: number of channels of the input feature
+ num_anchors: number of anchors to be predicted
+ """
+
+ def __init__(self, in_channels, num_anchors):
+ super(RPNHead, self).__init__()
+ # 3x3 sliding window
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+ # predicted objectness scores (here "object" only distinguishes foreground from background)
+ self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
+ # predicted bbox regression parameters
+ self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
+
+ for layer in self.children():
+ if isinstance(layer, nn.Conv2d):
+ torch.nn.init.normal_(layer.weight, std=0.01)
+ torch.nn.init.constant_(layer.bias, 0)
+
+ def forward(self, x):
+ # type: (List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
+ logits = []
+ bbox_reg = []
+ for i, feature in enumerate(x):
+ t = F.relu(self.conv(feature))
+ logits.append(self.cls_logits(t))
+ bbox_reg.append(self.bbox_pred(t))
+ return logits, bbox_reg
+
+
+def permute_and_flatten(layer, N, A, C, H, W):
+ # type: (Tensor, int, int, int, int, int) -> Tensor
+ """
+ Permute the tensor dimensions and reshape.
+ Args:
+ layer: objectness scores or bbox regression parameters predicted on one feature level
+ N: batch_size
+ A: anchors_num_per_position
+ C: classes_num or 4 (bbox coordinates)
+ H: height
+ W: width
+
+ Returns:
+ layer: the permuted and reshaped result, [N, -1, C]
+ """
+ # view and reshape do the same thing here: flatten all elements, then lay them out in the given shape
+ # view only works on tensors that are stored contiguously in memory; operations such as permute make a
+ # tensor non-contiguous, after which view can no longer be called; reshape has no such requirement
+ # [batch_size, anchors_num_per_position * (C or 4), height, width]
+ layer = layer.view(N, -1, C, H, W)
+ # permute the tensor dimensions
+ layer = layer.permute(0, 3, 4, 1, 2) # [N, H, W, -1, C]
+ layer = layer.reshape(N, -1, C)
+ return layer
+
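Shape-wise, for one FPN level with batch size 2, 3 anchors per location, one objectness channel and a 25x25 feature map, a quick check (random data):

    import torch

    N, A, C, H, W = 2, 3, 1, 25, 25
    layer = torch.randn(N, A * C, H, W)             # raw cls_logits of one level
    out = permute_and_flatten(layer, N, A, C, H, W)
    print(out.shape)                                # torch.Size([2, 1875, 1]) = [N, H*W*A, C]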
+
+def concat_box_prediction_layers(box_cls, box_regression):
+ # type: (List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
+ """
+ Adjust the tensor layout and shape of the per-level predictions in box_cls
+ and box_regression -> [N, -1, C]
+ Args:
+ box_cls: predicted objectness scores of every feature level
+ box_regression: predicted bbox regression parameters of every feature level
+
+ Returns:
+
+ """
+ box_cls_flattened = []
+ box_regression_flattened = []
+
+ # iterate over every prediction feature level
+ for box_cls_per_level, box_regression_per_level in zip(box_cls, box_regression):
+ # [batch_size, anchors_num_per_position * classes_num, height, width]
+ # note: when computing RPN proposals, classes_num=1, i.e. only object vs. background is distinguished
+ N, AxC, H, W = box_cls_per_level.shape
+ # [batch_size, anchors_num_per_position * 4, height, width]
+ Ax4 = box_regression_per_level.shape[1]
+ # anchors_num_per_position
+ A = Ax4 // 4
+ # classes_num
+ C = AxC // A
+
+ # [N, -1, C]
+ box_cls_per_level = permute_and_flatten(box_cls_per_level, N, A, C, H, W)
+ box_cls_flattened.append(box_cls_per_level)
+
+ # [N, -1, C]
+ box_regression_per_level = permute_and_flatten(box_regression_per_level, N, A, 4, H, W)
+ box_regression_flattened.append(box_regression_per_level)
+
+ box_cls = torch.cat(box_cls_flattened, dim=1).flatten(0, -2) # start_dim, end_dim
+ box_regression = torch.cat(box_regression_flattened, dim=1).reshape(-1, 4)
+ return box_cls, box_regression
+
+
+class RegionProposalNetwork(torch.nn.Module):
+ """
+ Implements Region Proposal Network (RPN).
+
+ Arguments:
+ anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+ maps.
+ head (nn.Module): module that computes the objectness and regression deltas
+ fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+ considered as positive during training of the RPN.
+ bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
+ considered as negative during training of the RPN.
+ batch_size_per_image (int): number of anchors that are sampled during training of the RPN
+ for computing the loss
+ positive_fraction (float): proportion of positive anchors in a mini-batch during training
+ of the RPN
+ pre_nms_top_n (Dict[str]): number of proposals to keep before applying NMS. It should
+ contain two fields: training and testing, to allow for different values depending
+ on training or evaluation
+ post_nms_top_n (Dict[str]): number of proposals to keep after applying NMS. It should
+ contain two fields: training and testing, to allow for different values depending
+ on training or evaluation
+ nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
+
+ """
+ __annotations__ = {
+ 'box_coder': det_utils.BoxCoder,
+ 'proposal_matcher': det_utils.Matcher,
+ 'fg_bg_sampler': det_utils.BalancedPositiveNegativeSampler,
+ 'pre_nms_top_n': Dict[str, int],
+ 'post_nms_top_n': Dict[str, int],
+ }
+
+ def __init__(self, anchor_generator, head,
+ fg_iou_thresh, bg_iou_thresh,
+ batch_size_per_image, positive_fraction,
+ pre_nms_top_n, post_nms_top_n, nms_thresh, score_thresh=0.0):
+ super(RegionProposalNetwork, self).__init__()
+ self.anchor_generator = anchor_generator
+ self.head = head
+ self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
+
+ # use during training
+ # IoU between anchors and ground-truth boxes
+ self.box_similarity = box_ops.box_iou
+
+ self.proposal_matcher = det_utils.Matcher(
+ fg_iou_thresh, # anchors with IoU above fg_iou_thresh (0.7) are treated as positive samples
+ bg_iou_thresh, # anchors with IoU below bg_iou_thresh (0.3) are treated as negative samples
+ allow_low_quality_matches=True
+ )
+
+ self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
+ batch_size_per_image, positive_fraction # 256, 0.5
+ )
+
+ # use during testing
+ self._pre_nms_top_n = pre_nms_top_n
+ self._post_nms_top_n = post_nms_top_n
+ self.nms_thresh = nms_thresh
+ self.score_thresh = score_thresh
+ self.min_size = 1.
+
+ def pre_nms_top_n(self):
+ if self.training:
+ return self._pre_nms_top_n['training']
+ return self._pre_nms_top_n['testing']
+
+ def post_nms_top_n(self):
+ if self.training:
+ return self._post_nms_top_n['training']
+ return self._post_nms_top_n['testing']
+
+ def assign_targets_to_anchors(self, anchors, targets):
+ # type: (List[Tensor], List[Dict[str, Tensor]]) -> Tuple[List[Tensor], List[Tensor]]
+ """
+ For every anchor, find its best matching gt box and classify the anchor as positive, background or discarded.
+ Args:
+ anchors: (List[Tensor])
+ targets: (List[Dict[Tensor])
+ Returns:
+ labels: anchor labels (1, 0, -1 for positive, background and discarded samples respectively)
+ note: the RPN only distinguishes foreground from background, so every positive sample has label 1 and 0 means background
+ matched_gt_boxes: the gt boxes matched to the anchors
+ """
+ labels = []
+ matched_gt_boxes = []
+ # iterate over the anchors and targets of each image
+ for anchors_per_image, targets_per_image in zip(anchors, targets):
+ gt_boxes = targets_per_image["boxes"]
+ if gt_boxes.numel() == 0:
+ device = anchors_per_image.device
+ matched_gt_boxes_per_image = torch.zeros(anchors_per_image.shape, dtype=torch.float32, device=device)
+ labels_per_image = torch.zeros((anchors_per_image.shape[0],), dtype=torch.float32, device=device)
+ else:
+ # IoU between the anchors and the ground-truth boxes
+ # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
+ match_quality_matrix = box_ops.box_iou(gt_boxes, anchors_per_image)
+ # for each anchor, record the index of the gt box with the highest IoU
+ # (anchors with iou < 0.3 get index -1, anchors with 0.3 <= iou < 0.7 get index -2)
+ matched_idxs = self.proposal_matcher(match_quality_matrix)
+
+ # get the targets corresponding GT for each anchor
+ # NB: need to clamp the indices because we can have a single
+ # GT in the image, and matched_idxs can be -2, which goes
+ # out of bounds
+ matched_gt_boxes_per_image = gt_boxes[matched_idxs.clamp(min=0)]
+
+ # record which anchors are positive samples (matched_idxs >= 0); negatives and discarded anchors are handled below
+ labels_per_image = matched_idxs >= 0
+ labels_per_image = labels_per_image.to(dtype=torch.float32)
+
+ # background (negative examples)
+ bg_indices = matched_idxs == self.proposal_matcher.BELOW_LOW_THRESHOLD # -1
+ labels_per_image[bg_indices] = 0.0
+
+ # discard indices that are between thresholds
+ inds_to_discard = matched_idxs == self.proposal_matcher.BETWEEN_THRESHOLDS # -2
+ labels_per_image[inds_to_discard] = -1.0
+
+ labels.append(labels_per_image)
+ matched_gt_boxes.append(matched_gt_boxes_per_image)
+ return labels, matched_gt_boxes
+
+ def _get_top_n_idx(self, objectness, num_anchors_per_level):
+ # type: (Tensor, List[int]) -> Tensor
+ """
+ Get the indices of the top pre_nms_top_n anchors on every feature level, ranked by predicted objectness.
+ Args:
+ objectness: Tensor (predicted objectness scores of every image)
+ num_anchors_per_level: List (number of anchors predicted on every feature level)
+ Returns:
+
+ """
+ r = [] # indices of the pre_nms_top_n highest-scoring anchors of every feature level
+ offset = 0
+ # iterate over the objectness predictions of every feature level
+ for ob in objectness.split(num_anchors_per_level, 1):
+ if torchvision._is_tracing():
+ num_anchors, pre_nms_top_n = _onnx_get_num_anchors_and_pre_nms_top_n(ob, self.pre_nms_top_n())
+ else:
+ num_anchors = ob.shape[1] # number of anchors predicted on this feature level
+ pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
+
+ # Returns the k largest elements of the given input tensor along a given dimension
+ _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
+ r.append(top_n_idx + offset)
+ offset += num_anchors
+ return torch.cat(r, dim=1)
+
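The offset bookkeeping keeps the per-level top-k indices valid in the concatenated tensor; a toy run with two levels holding 4 and 3 anchors (random scores, pretending pre_nms_top_n() returns 2):

    import torch

    objectness = torch.randn(1, 7)                  # batch of 1, 4 + 3 anchors
    r, offset = [], 0
    for ob in objectness.split([4, 3], 1):
        k = min(2, ob.shape[1])
        _, top_idx = ob.topk(k, dim=1)
        r.append(top_idx + offset)                  # shift into the global index space
        offset += ob.shape[1]
    print(torch.cat(r, dim=1))                      # two indices in [0, 3], then two in [4, 6]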
+ def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
+ # type: (Tensor, Tensor, List[Tuple[int, int]], List[int]) -> Tuple[List[Tensor], List[Tensor]]
+ """
+ Remove small boxes, run NMS and keep the top post_nms_top_n proposals ranked by predicted score.
+ Args:
+ proposals: predicted bbox coordinates
+ objectness: predicted objectness scores
+ image_shapes: size of every image in the batch
+ num_anchors_per_level: number of anchors predicted on every feature level
+
+ Returns:
+
+ """
+ num_images = proposals.shape[0]
+ device = proposals.device
+
+ # do not backprop through objectness
+ objectness = objectness.detach()
+ objectness = objectness.reshape(num_images, -1)
+
+ # torch.full returns a tensor of the given size filled with fill_value
+ # levels records which feature level each anchor index belongs to
+ levels = [torch.full((n, ), idx, dtype=torch.int64, device=device)
+ for idx, n in enumerate(num_anchors_per_level)]
+ levels = torch.cat(levels, 0)
+
+ # Expand this tensor to the same size as objectness
+ levels = levels.reshape(1, -1).expand_as(objectness)
+
+ # select top_n boxes independently per level before applying nms
+ # indices of the top pre_nms_top_n anchors of every feature level, ranked by predicted score
+ top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)
+
+ image_range = torch.arange(num_images, device=device)
+ batch_idx = image_range[:, None] # [batch_size, 1]
+
+ # gather the objectness scores and levels of those top anchors
+ objectness = objectness[batch_idx, top_n_idx]
+ levels = levels[batch_idx, top_n_idx]
+ # and the corresponding bbox coordinates
+ proposals = proposals[batch_idx, top_n_idx]
+
+ objectness_prob = torch.sigmoid(objectness)
+
+ final_boxes = []
+ final_scores = []
+ # iterate over the predictions of each image
+ for boxes, scores, lvl, img_shape in zip(proposals, objectness_prob, levels, image_shapes):
+ # clip out-of-image box coordinates to the image boundaries
+ boxes = box_ops.clip_boxes_to_image(boxes, img_shape)
+
+ # indices of boxes whose width and height are both at least min_size
+ keep = box_ops.remove_small_boxes(boxes, self.min_size)
+ boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
+
+ # remove low-scoring boxes, see the link below
+ # https://github.com/pytorch/vision/pull/3205
+ keep = torch.where(torch.ge(scores, self.score_thresh))[0] # ge: >=
+ boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
+
+ # non-maximum suppression, independently done per level
+ keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)
+
+ # keep only topk scoring predictions
+ keep = keep[: self.post_nms_top_n()]
+ boxes, scores = boxes[keep], scores[keep]
+
+ final_boxes.append(boxes)
+ final_scores.append(scores)
+ return final_boxes, final_scores
+
+ def compute_loss(self, objectness, pred_bbox_deltas, labels, regression_targets):
+ # type: (Tensor, Tensor, List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
+ """
+ Compute the RPN losses: the objectness loss (foreground vs. background) and the bbox regression loss.
+ Arguments:
+ objectness (Tensor): predicted objectness scores
+ pred_bbox_deltas (Tensor): predicted bbox regression parameters
+ labels (List[Tensor]): ground-truth labels 1, 0, -1 (one list element per image in the batch)
+ regression_targets (List[Tensor]): ground-truth bbox regression targets
+
+ Returns:
+ objectness_loss (Tensor): objectness (classification) loss
+ box_loss (Tensor): bounding box regression loss
+ """
+ # sample positive/negative anchors according to batch_size_per_image and positive_fraction
+ sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
+ # concatenate the per-image positive/negative sample masks and take the indices of the non-zero entries
+ # sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
+ sampled_pos_inds = torch.where(torch.cat(sampled_pos_inds, dim=0))[0]
+ # sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
+ sampled_neg_inds = torch.where(torch.cat(sampled_neg_inds, dim=0))[0]
+
+ # concatenate all positive and negative sample indices
+ sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)
+ objectness = objectness.flatten()
+
+ labels = torch.cat(labels, dim=0)
+ regression_targets = torch.cat(regression_targets, dim=0)
+
+ # bounding box regression loss
+ box_loss = det_utils.smooth_l1_loss(
+ pred_bbox_deltas[sampled_pos_inds],
+ regression_targets[sampled_pos_inds],
+ beta=1 / 9,
+ size_average=False,
+ ) / (sampled_inds.numel())
+
+ # objectness (classification) loss
+ objectness_loss = F.binary_cross_entropy_with_logits(
+ objectness[sampled_inds], labels[sampled_inds]
+ )
+
+ return objectness_loss, box_loss
+
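Note that the box loss is summed over the positive samples but normalized by the total number of sampled anchors (positives plus negatives). A tiny numeric sketch of that convention using torch's built-in smooth L1, which for the same beta should match det_utils.smooth_l1_loss up to the reduction (an assumption, since det_utils is defined elsewhere in this repo):

    import torch
    import torch.nn.functional as F

    pred = torch.tensor([[0.20, 0.10, 0.00, 0.30]])   # deltas of one positive anchor
    target = torch.zeros(1, 4)
    num_sampled = 256                                  # positives + negatives per image
    box_loss = F.smooth_l1_loss(pred, target, beta=1 / 9, reduction="sum") / num_sampled
    print(box_loss)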
+ def forward(self,
+ images, # type: ImageList
+ features, # type: Dict[str, Tensor]
+ targets=None # type: Optional[List[Dict[str, Tensor]]]
+ ):
+ # type: (...) -> Tuple[List[Tensor], Dict[str, Tensor]]
+ """
+ Arguments:
+ images (ImageList): images for which we want to compute the predictions
+ features (Dict[Tensor]): features computed from the images that are
+ used for computing the predictions. Each tensor in the list
+ correspond to different feature levels
+ targets (List[Dict[Tensor]): ground-truth boxes present in the image (optional).
+ If provided, each element in the dict should contain a field `boxes`,
+ with the locations of the ground-truth boxes.
+
+ Returns:
+ boxes (List[Tensor]): the predicted boxes from the RPN, one Tensor per
+ image.
+ losses (Dict[Tensor]): the losses for the model during training. During
+ testing, it is an empty dict.
+ """
+ # RPN uses all feature maps that are available
+ # features is an OrderedDict of all prediction feature levels
+ features = list(features.values())
+
+ # predict objectness scores and bbox regression parameters on every feature level
+ # objectness and pred_bbox_deltas are both lists
+ objectness, pred_bbox_deltas = self.head(features)
+
+ # generate the anchors of all images in the batch; the list has batch_size elements
+ anchors = self.anchor_generator(images, features)
+
+ # batch_size
+ num_images = len(anchors)
+
+ # numel() Returns the total number of elements in the input tensor.
+ # number of anchors on every prediction feature level
+ num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
+ num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
+
+ # adjust the internal tensor layout and shape
+ objectness, pred_bbox_deltas = concat_box_prediction_layers(objectness,
+ pred_bbox_deltas)
+
+ # apply pred_bbox_deltas to anchors to obtain the decoded proposals
+ # note that we detach the deltas because Faster R-CNN does not backprop through
+ # the proposals
+ # apply the predicted bbox regression parameters to the anchors to obtain the predicted bbox coordinates
+ proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
+ proposals = proposals.view(num_images, -1, 4)
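+ # proposals now has shape [num_images, anchors_per_image, 4]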
+
+ # remove small boxes, apply NMS, and keep the post_nms_top_n proposals with the highest predicted scores
+ boxes, scores = self.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)
+
+ losses = {}
+ if self.training:
+ assert targets is not None
+ # find the best-matching gt for every anchor and classify anchors as foreground, background or discarded
+ labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
+ # compute the regression targets from the anchors and their matched gt boxes
+ regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
+ loss_objectness, loss_rpn_box_reg = self.compute_loss(
+ objectness, pred_bbox_deltas, labels, regression_targets
+ )
+ losses = {
+ "loss_objectness": loss_objectness,
+ "loss_rpn_box_reg": loss_rpn_box_reg
+ }
+ return boxes, losses
diff --git a/pytorch_object_detection/mask_rcnn/network_files/transform.py b/pytorch_object_detection/mask_rcnn/network_files/transform.py
new file mode 100644
index 000000000..420d8ed0e
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/network_files/transform.py
@@ -0,0 +1,490 @@
+import math
+from typing import List, Tuple, Dict, Optional
+
+import torch
+from torch import nn, Tensor
+import torch.nn.functional as F
+import torchvision
+
+from .image_list import ImageList
+
+
+def _onnx_paste_mask_in_image(mask, box, im_h, im_w):
+ one = torch.ones(1, dtype=torch.int64)
+ zero = torch.zeros(1, dtype=torch.int64)
+
+ w = box[2] - box[0] + one
+ h = box[3] - box[1] + one
+ w = torch.max(torch.cat((w, one)))
+ h = torch.max(torch.cat((h, one)))
+
+ # Set shape to [batchxCxHxW]
+ mask = mask.expand((1, 1, mask.size(0), mask.size(1)))
+
+ # Resize mask
+ mask = F.interpolate(mask, size=(int(h), int(w)), mode="bilinear", align_corners=False)
+ mask = mask[0][0]
+
+ x_0 = torch.max(torch.cat((box[0].unsqueeze(0), zero)))
+ x_1 = torch.min(torch.cat((box[2].unsqueeze(0) + one, im_w.unsqueeze(0))))
+ y_0 = torch.max(torch.cat((box[1].unsqueeze(0), zero)))
+ y_1 = torch.min(torch.cat((box[3].unsqueeze(0) + one, im_h.unsqueeze(0))))
+
+ unpaded_im_mask = mask[(y_0 - box[1]): (y_1 - box[1]), (x_0 - box[0]): (x_1 - box[0])]
+
+ # TODO : replace below with a dynamic padding when support is added in ONNX
+
+ # pad y
+ zeros_y0 = torch.zeros(y_0, unpaded_im_mask.size(1))
+ zeros_y1 = torch.zeros(im_h - y_1, unpaded_im_mask.size(1))
+ concat_0 = torch.cat((zeros_y0, unpaded_im_mask.to(dtype=torch.float32), zeros_y1), 0)[0:im_h, :]
+ # pad x
+ zeros_x0 = torch.zeros(concat_0.size(0), x_0)
+ zeros_x1 = torch.zeros(concat_0.size(0), im_w - x_1)
+ im_mask = torch.cat((zeros_x0, concat_0, zeros_x1), 1)[:, :im_w]
+ return im_mask
+
+
+@torch.jit._script_if_tracing
+def _onnx_paste_mask_in_image_loop(masks, boxes, im_h, im_w):
+ res_append = torch.zeros(0, im_h, im_w)
+ for i in range(masks.size(0)):
+ mask_res = _onnx_paste_mask_in_image(masks[i][0], boxes[i], im_h, im_w)
+ mask_res = mask_res.unsqueeze(0)
+ res_append = torch.cat((res_append, mask_res))
+
+ return res_append
+
+
+@torch.jit.unused
+def _get_shape_onnx(image: Tensor) -> Tensor:
+ from torch.onnx import operators
+
+ return operators.shape_as_tensor(image)[-2:]
+
+
+@torch.jit.unused
+def _fake_cast_onnx(v: Tensor) -> float:
+ # ONNX requires a tensor but here we fake its type for JIT.
+ return v
+
+
+def _resize_image_and_masks(image: Tensor,
+ self_min_size: float,
+ self_max_size: float,
+ target: Optional[Dict[str, Tensor]] = None,
+ fixed_size: Optional[Tuple[int, int]] = None
+ ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+
+ if torchvision._is_tracing():
+ im_shape = _get_shape_onnx(image)
+ else:
+ im_shape = torch.tensor(image.shape[-2:])
+
+ size: Optional[List[int]] = None
+ scale_factor: Optional[float] = None
+ recompute_scale_factor: Optional[bool] = None
+ if fixed_size is not None:
+ size = [fixed_size[1], fixed_size[0]]
+ else:
+ min_size = torch.min(im_shape).to(dtype=torch.float32) # the smaller of height and width
+ max_size = torch.max(im_shape).to(dtype=torch.float32) # the larger of height and width
+ scale = torch.min(self_min_size / min_size, self_max_size / max_size) # compute the scale factor
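+ # taking the minimum of the two ratios scales the short side up to self_min_size unless
+ # that would make the long side exceed self_max_size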
+
+ if torchvision._is_tracing():
+ scale_factor = _fake_cast_onnx(scale)
+ else:
+ scale_factor = scale.item()
+ recompute_scale_factor = True
+
+ # resize the image via interpolation
+ # image[None] prepends a batch dimension: [C, H, W] -> [1, C, H, W]
+ # bilinear mode only supports 4D tensors
+ image = torch.nn.functional.interpolate(
+ image[None],
+ size=size,
+ scale_factor=scale_factor,
+ mode="bilinear",
+ recompute_scale_factor=recompute_scale_factor,
+ align_corners=False)[0]
+
+ if target is None:
+ return image, target
+
+ if "masks" in target:
+ mask = target["masks"]
+ mask = torch.nn.functional.interpolate(
+ mask[:, None].float(), size=size, scale_factor=scale_factor, recompute_scale_factor=recompute_scale_factor
+ )[:, 0].byte() # self.byte() is equivalent to self.to(torch.uint8).
+ target["masks"] = mask
+
+ return image, target
+
+
+def _onnx_expand_boxes(boxes, scale):
+ # type: (Tensor, float) -> Tensor
+ w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
+ h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
+ x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
+ y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
+
+ w_half = w_half.to(dtype=torch.float32) * scale
+ h_half = h_half.to(dtype=torch.float32) * scale
+
+ boxes_exp0 = x_c - w_half
+ boxes_exp1 = y_c - h_half
+ boxes_exp2 = x_c + w_half
+ boxes_exp3 = y_c + h_half
+ boxes_exp = torch.stack((boxes_exp0, boxes_exp1, boxes_exp2, boxes_exp3), 1)
+ return boxes_exp
+
+
+# the next two functions should be merged inside Masker
+# but are kept here for the moment while we need them
+# temporarily for paste_mask_in_image
+def expand_boxes(boxes, scale):
+ # type: (Tensor, float) -> Tensor
+ if torchvision._is_tracing():
+ return _onnx_expand_boxes(boxes, scale)
+ w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
+ h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
+ x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
+ y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = torch.zeros_like(boxes)
+ boxes_exp[:, 0] = x_c - w_half
+ boxes_exp[:, 2] = x_c + w_half
+ boxes_exp[:, 1] = y_c - h_half
+ boxes_exp[:, 3] = y_c + h_half
+ return boxes_exp
+
+
+@torch.jit.unused
+def expand_masks_tracing_scale(M, padding):
+ # type: (int, int) -> float
+ return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)
+
+
+def expand_masks(mask, padding):
+ # type: (Tensor, int) -> Tuple[Tensor, float]
+ M = mask.shape[-1]
+ if torch._C._get_tracing_state(): # could not import is_tracing(), not sure why
+ scale = expand_masks_tracing_scale(M, padding)
+ else:
+ scale = float(M + 2 * padding) / M
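+ # F.pad with (padding,) * 4 pads the last two dimensions (H and W) of the mask by padding on each side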
+ padded_mask = F.pad(mask, (padding,) * 4)
+ return padded_mask, scale
+
+
+def paste_mask_in_image(mask, box, im_h, im_w):
+ # type: (Tensor, Tensor, int, int) -> Tensor
+
+ # refer to: https://github.com/pytorch/vision/issues/5845
+ TO_REMOVE = 1
+ w = int(box[2] - box[0] + TO_REMOVE)
+ h = int(box[3] - box[1] + TO_REMOVE)
+ w = max(w, 1)
+ h = max(h, 1)
+
+ # Set shape to [batch, C, H, W]
+ # the subsequent bilinear interpolation only supports 4D tensors
+ mask = mask.expand((1, 1, -1, -1)) # -1 means not changing the size of that dimension
+
+ # Resize mask
+ mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
+ mask = mask[0][0] # [batch, C, H, W] -> [H, W]
+
+ im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
+ # compute the target region in the original image (clipped to the image bounds)
+ x_0 = max(box[0], 0)
+ x_1 = min(box[2] + 1, im_w)
+ y_0 = max(box[1], 0)
+ y_1 = min(box[3] + 1, im_h)
+
+ # paste the resized mask into the corresponding target region
+ im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - box[1]):(y_1 - box[1]), (x_0 - box[0]):(x_1 - box[0])]
+ return im_mask
+
+
+def paste_masks_in_image(masks, boxes, img_shape, padding=1):
+ # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor
+
+ # according to the official PyTorch implementation, expanding the masks slightly improves mAP
+ # refer to: https://github.com/pytorch/vision/issues/5845
+ masks, scale = expand_masks(masks, padding=padding)
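+ # enlarge the boxes by the same factor used to pad the masks so that masks and boxes stay aligned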
+ boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
+ im_h, im_w = img_shape
+
+ if torchvision._is_tracing():
+ return _onnx_paste_mask_in_image_loop(
+ masks, boxes, torch.scalar_tensor(im_h, dtype=torch.int64), torch.scalar_tensor(im_w, dtype=torch.int64)
+ )[:, None]
+ res = [paste_mask_in_image(m[0], b, im_h, im_w) for m, b in zip(masks, boxes)]
+ if len(res) > 0:
+ ret = torch.stack(res, dim=0)[:, None] # [num_obj, 1, H, W]
+ else:
+ ret = masks.new_empty((0, 1, im_h, im_w))
+ return ret
+
+
+class GeneralizedRCNNTransform(nn.Module):
+ """
+ Performs input / target transformation before feeding the data to a GeneralizedRCNN
+ model.
+
+ The transformations it performs are:
+ - input normalization (mean subtraction and std division)
+ - input / target resizing to match min_size / max_size
+
+ It returns an ImageList for the inputs, and a List[Dict[Tensor]] for the targets
+ """
+
+ def __init__(self,
+ min_size: int,
+ max_size: int,
+ image_mean: List[float],
+ image_std: List[float],
+ size_divisible: int = 32,
+ fixed_size: Optional[Tuple[int, int]] = None):
+ super().__init__()
+ if not isinstance(min_size, (list, tuple)):
+ min_size = (min_size,)
+ self.min_size = min_size # target range for the shorter image side
+ self.max_size = max_size # upper bound for the longer image side
+ self.image_mean = image_mean # per-channel mean used for normalization
+ self.image_std = image_std # per-channel std used for normalization
+ self.size_divisible = size_divisible
+ self.fixed_size = fixed_size
+
+ def normalize(self, image):
+ """标准化处理"""
+ dtype, device = image.dtype, image.device
+ mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
+ std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
+ # [:, None, None]: shape [3] -> [3, 1, 1]
+ return (image - mean[:, None, None]) / std[:, None, None]
+
+ def torch_choice(self, k):
+ # type: (List[int]) -> int
+ """
+ Implements `random.choice` via torch ops so it can be compiled with
+ TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
+ is fixed.
+ """
+ index = int(torch.empty(1).uniform_(0., float(len(k))).item())
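+ # draw a uniform float in [0, len(k)) and truncate it to an integer index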
+ return k[index]
+
+ def resize(self, image, target):
+ # type: (Tensor, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]
+ """
+ Resize the image into the configured size range and rescale the bboxes accordingly.
+ Args:
+ image: input image
+ target: annotations of the input image (including the bboxes)
+
+ Returns:
+ image: the resized image
+ target: the annotations with bboxes rescaled
+ """
+ # image shape is [channel, height, width]
+ h, w = image.shape[-2:]
+
+ if self.training:
+ size = float(self.torch_choice(self.min_size)) # target length for the shorter image side; note this is self.min_size, not min_size
+ else:
+ # FIXME assume for now that testing uses the largest scale
+ size = float(self.min_size[-1]) # target length for the shorter image side; note this is self.min_size, not min_size
+
+ image, target = _resize_image_and_masks(image, size, float(self.max_size), target, self.fixed_size)
+
+ if target is None:
+ return image, target
+
+ bbox = target["boxes"]
+ # rescale the bboxes by the same ratio used for the image
+ bbox = resize_boxes(bbox, [h, w], image.shape[-2:])
+ target["boxes"] = bbox
+
+ return image, target
+
+ # _onnx_batch_images() is an implementation of
+ # batch_images() that is supported by ONNX tracing.
+ @torch.jit.unused
+ def _onnx_batch_images(self, images, size_divisible=32):
+ # type: (List[Tensor], int) -> Tensor
+ max_size = []
+ for i in range(images[0].dim()):
+ max_size_i = torch.max(torch.stack([img.shape[i] for img in images]).to(torch.float32)).to(torch.int64)
+ max_size.append(max_size_i)
+ stride = size_divisible
+ max_size[1] = (torch.ceil((max_size[1].to(torch.float32)) / stride) * stride).to(torch.int64)
+ max_size[2] = (torch.ceil((max_size[2].to(torch.float32)) / stride) * stride).to(torch.int64)
+ max_size = tuple(max_size)
+
+ # work around for
+ # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+ # which is not yet supported in onnx
+ padded_imgs = []
+ for img in images:
+ padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
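+ # F.pad takes (before, after) pairs starting from the last dimension, so this pads
+ # width, height and channels on the right/bottom side only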
+ padded_img = torch.nn.functional.pad(img, [0, padding[2], 0, padding[1], 0, padding[0]])
+ padded_imgs.append(padded_img)
+
+ return torch.stack(padded_imgs)
+
+ def max_by_axis(self, the_list):
+ # type: (List[List[int]]) -> List[int]
+ maxes = the_list[0]
+ for sublist in the_list[1:]:
+ for index, item in enumerate(sublist):
+ maxes[index] = max(maxes[index], item)
+ return maxes
+
+ def batch_images(self, images, size_divisible=32):
+ # type: (List[Tensor], int) -> Tensor
+ """
+ Pack a list of images into one batched tensor (every image in the batch ends up with the same shape).
+ Args:
+ images: the input images
+ size_divisible: pad the image height and width up to a multiple of this value
+
+ Returns:
+ batched_imgs: the batched image tensor
+ """
+
+ if torchvision._is_tracing():
+ # batch_images() does not export well to ONNX
+ # call _onnx_batch_images() instead
+ return self._onnx_batch_images(images, size_divisible)
+
+ # compute the maximum channel, height and width over all images in the batch
+ max_size = self.max_by_axis([list(img.shape) for img in images])
+
+ stride = float(size_divisible)
+ # max_size = list(max_size)
+ # round height up to a multiple of stride
+ max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
+ # round width up to a multiple of stride
+ max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)
+
+ # [batch, channel, height, width]
+ batch_shape = [len(images)] + max_size
+
+ # create a tensor of shape batch_shape filled with zeros
+ batched_imgs = images[0].new_full(batch_shape, 0)
+ for img, pad_img in zip(images, batched_imgs):
+ # copy each input image into the corresponding slice of batched_imgs, aligned to the top-left corner,
+ # so the bbox coordinates stay valid and every image in the batch ends up with the same shape
+ # copy_: copies the elements from src into self tensor and returns self
+ pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+
+ return batched_imgs
+
+ def postprocess(self,
+ result, # type: List[Dict[str, Tensor]]
+ image_shapes, # type: List[Tuple[int, int]]
+ original_image_sizes # type: List[Tuple[int, int]]
+ ):
+ # type: (...) -> List[Dict[str, Tensor]]
+ """
+ Post-process the network predictions (mainly mapping bboxes and masks back to the original image scale).
+ Args:
+ result: list(dict), network predictions, len(result) == batch_size
+ image_shapes: list(torch.Size), image sizes after the resize preprocessing, len(image_shapes) == batch_size
+ original_image_sizes: list(torch.Size), original image sizes, len(original_image_sizes) == batch_size
+
+ Returns:
+ result: the predictions mapped back to the original image scale
+ """
+ if self.training:
+ return result
+
+ # iterate over the predictions of each image and map the boxes back to the original scale
+ for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
+ boxes = pred["boxes"]
+ boxes = resize_boxes(boxes, im_s, o_im_s) # rescale the bboxes back to the original image scale
+ result[i]["boxes"] = boxes
+ if "masks" in pred:
+ masks = pred["masks"]
+ # paste the masks back onto the original image scale
+ masks = paste_masks_in_image(masks, boxes, o_im_s)
+ result[i]["masks"] = masks
+
+ return result
+
+ def __repr__(self):
+ """自定义输出实例化对象的信息,可通过print打印实例信息"""
+ format_string = self.__class__.__name__ + '('
+ _indent = '\n '
+ format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std)
+ format_string += "{0}Resize(min_size={1}, max_size={2}, mode='bilinear')".format(_indent, self.min_size,
+ self.max_size)
+ format_string += '\n)'
+ return format_string
+
+ def forward(self,
+ images, # type: List[Tensor]
+ targets=None # type: Optional[List[Dict[str, Tensor]]]
+ ):
+ # type: (...) -> Tuple[ImageList, Optional[List[Dict[str, Tensor]]]]
+ images = [img for img in images]
+ for i in range(len(images)):
+ image = images[i]
+ target_index = targets[i] if targets is not None else None
+
+ if image.dim() != 3:
+ raise ValueError("images is expected to be a list of 3d tensors "
+ "of shape [C, H, W], got {}".format(image.shape))
+ image = self.normalize(image) # normalize the image
+ image, target_index = self.resize(image, target_index) # resize the image and its bboxes into the configured range
+ images[i] = image
+ if targets is not None and target_index is not None:
+ targets[i] = target_index
+
+ # record the image sizes after resizing
+ image_sizes = [img.shape[-2:] for img in images]
+ images = self.batch_images(images, self.size_divisible) # pack the images into one batch
+ image_sizes_list = torch.jit.annotate(List[Tuple[int, int]], [])
+
+ for image_size in image_sizes:
+ assert len(image_size) == 2
+ image_sizes_list.append((image_size[0], image_size[1]))
+
+ image_list = ImageList(images, image_sizes_list)
+ return image_list, targets
+
+
+def resize_boxes(boxes, original_size, new_size):
+ # type: (Tensor, List[int], List[int]) -> Tensor
+ """
+ Rescale the boxes according to how the image was resized.
+
+ Arguments:
+ original_size: image size before resizing
+ new_size: image size after resizing
+ """
+ ratios = [
+ torch.tensor(s, dtype=torch.float32, device=boxes.device) /
+ torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
+ for s, s_orig in zip(new_size, original_size)
+ ]
+ ratios_height, ratios_width = ratios
+ # Removes a tensor dimension, boxes [minibatch, 4]
+ # Returns a tuple of all slices along a given dimension, already without it.
+ xmin, ymin, xmax, ymax = boxes.unbind(1)
+ xmin = xmin * ratios_width
+ xmax = xmax * ratios_width
+ ymin = ymin * ratios_height
+ ymax = ymax * ratios_height
+ return torch.stack((xmin, ymin, xmax, ymax), dim=1)
+
+
+
+
+
+
+
+
diff --git a/pytorch_object_detection/mask_rcnn/pascal_voc_indices.json b/pytorch_object_detection/mask_rcnn/pascal_voc_indices.json
new file mode 100644
index 000000000..1c795887b
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/pascal_voc_indices.json
@@ -0,0 +1,22 @@
+{
+ "1": "aeroplane",
+ "2": "bicycle",
+ "3": "bird",
+ "4": "boat",
+ "5": "bottle",
+ "6": "bus",
+ "7": "car",
+ "8": "cat",
+ "9": "chair",
+ "10": "cow",
+ "11": "diningtable",
+ "12": "dog",
+ "13": "horse",
+ "14": "motorbike",
+ "15": "person",
+ "16": "pottedplant",
+ "17": "sheep",
+ "18": "sofa",
+ "19": "train",
+ "20": "tvmonitor"
+}
\ No newline at end of file
diff --git a/pytorch_object_detection/mask_rcnn/plot_curve.py b/pytorch_object_detection/mask_rcnn/plot_curve.py
new file mode 100644
index 000000000..188df710e
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/plot_curve.py
@@ -0,0 +1,46 @@
+import datetime
+import matplotlib.pyplot as plt
+
+
+def plot_loss_and_lr(train_loss, learning_rate):
+ try:
+ x = list(range(len(train_loss)))
+ fig, ax1 = plt.subplots(1, 1)
+ ax1.plot(x, train_loss, 'r', label='loss')
+ ax1.set_xlabel("step")
+ ax1.set_ylabel("loss")
+ ax1.set_title("Train Loss and lr")
+ plt.legend(loc='best')
+
+ ax2 = ax1.twinx()
+ ax2.plot(x, learning_rate, label='lr')
+ ax2.set_ylabel("learning rate")
+ ax2.set_xlim(0, len(train_loss)) # set the x-axis range
+ plt.legend(loc='best')
+
+ handles1, labels1 = ax1.get_legend_handles_labels()
+ handles2, labels2 = ax2.get_legend_handles_labels()
+ plt.legend(handles1 + handles2, labels1 + labels2, loc='upper right')
+
+ fig.subplots_adjust(right=0.8) # prevent the saved figure from being cut off
+ fig.savefig('./loss_and_lr{}.png'.format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))
+ plt.close()
+ print("successful save loss curve! ")
+ except Exception as e:
+ print(e)
+
+
+def plot_map(mAP):
+ try:
+ x = list(range(len(mAP)))
+ plt.plot(x, mAP, label='mAP')
+ plt.xlabel('epoch')
+ plt.ylabel('mAP')
+ plt.title('Eval mAP')
+ plt.xlim(0, len(mAP))
+ plt.legend(loc='best')
+ plt.savefig('./mAP.png')
+ plt.close()
+ print("successful save mAP curve!")
+ except Exception as e:
+ print(e)
diff --git a/pytorch_object_detection/mask_rcnn/predict.py b/pytorch_object_detection/mask_rcnn/predict.py
new file mode 100644
index 000000000..46f086756
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/predict.py
@@ -0,0 +1,106 @@
+import os
+import time
+import json
+
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
+import torch
+from torchvision import transforms
+
+from network_files import MaskRCNN
+from backbone import resnet50_fpn_backbone
+from draw_box_utils import draw_objs
+
+
+def create_model(num_classes, box_thresh=0.5):
+ backbone = resnet50_fpn_backbone()
+ model = MaskRCNN(backbone,
+ num_classes=num_classes,
+ rpn_score_thresh=box_thresh,
+ box_score_thresh=box_thresh)
+
+ return model
+
+
+def time_synchronized():
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ return time.time()
+
+
+def main():
+ num_classes = 90 # excluding background
+ box_thresh = 0.5
+ weights_path = "./save_weights/model_25.pth"
+ img_path = "./test.jpg"
+ label_json_path = './coco91_indices.json'
+
+ # get devices
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ print("using {} device.".format(device))
+
+ # create model
+ model = create_model(num_classes=num_classes + 1, box_thresh=box_thresh)
+
+ # load train weights
+ assert os.path.exists(weights_path), "{} file does not exist.".format(weights_path)
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
+ model.to(device)
+
+ # read class_indict
+ assert os.path.exists(label_json_path), "json file {} does not exist.".format(label_json_path)
+ with open(label_json_path, 'r') as json_file:
+ category_index = json.load(json_file)
+
+ # load image
+ assert os.path.exists(img_path), f"{img_path} does not exist."
+ original_img = Image.open(img_path).convert('RGB')
+
+ # from pil image to tensor, do not normalize image
+ data_transform = transforms.Compose([transforms.ToTensor()])
+ img = data_transform(original_img)
+ # expand batch dimension
+ img = torch.unsqueeze(img, dim=0)
+
+ model.eval() # switch to evaluation mode
+ with torch.no_grad():
+ # warm-up: run a dummy forward pass so the timed inference below excludes one-time initialization cost
+ img_height, img_width = img.shape[-2:]
+ init_img = torch.zeros((1, 3, img_height, img_width), device=device)
+ model(init_img)
+
+ t_start = time_synchronized()
+ predictions = model(img.to(device))[0]
+ t_end = time_synchronized()
+ print("inference+NMS time: {}".format(t_end - t_start))
+
+ predict_boxes = predictions["boxes"].to("cpu").numpy()
+ predict_classes = predictions["labels"].to("cpu").numpy()
+ predict_scores = predictions["scores"].to("cpu").numpy()
+ predict_mask = predictions["masks"].to("cpu").numpy()
+ predict_mask = np.squeeze(predict_mask, axis=1) # [batch, 1, h, w] -> [batch, h, w]
+
+ if len(predict_boxes) == 0:
+ print("没有检测到任何目标!")
+ return
+
+ plot_img = draw_objs(original_img,
+ boxes=predict_boxes,
+ classes=predict_classes,
+ scores=predict_scores,
+ masks=predict_mask,
+ category_index=category_index,
+ line_thickness=3,
+ font='arial.ttf',
+ font_size=20)
+ plt.imshow(plot_img)
+ plt.show()
+ # save the visualized prediction result
+ plot_img.save("test_result.jpg")
+
+
+if __name__ == '__main__':
+ main()
+
diff --git a/pytorch_object_detection/mask_rcnn/requirements.txt b/pytorch_object_detection/mask_rcnn/requirements.txt
new file mode 100644
index 000000000..9e524e23e
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/requirements.txt
@@ -0,0 +1,8 @@
+lxml
+matplotlib
+numpy
+tqdm
+pycocotools
+Pillow
+torch==1.13.1
+torchvision==0.11.1
diff --git a/pytorch_object_detection/mask_rcnn/seg_results20220406-141544.txt b/pytorch_object_detection/mask_rcnn/seg_results20220406-141544.txt
new file mode 100644
index 000000000..ac46baf82
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/seg_results20220406-141544.txt
@@ -0,0 +1,26 @@
+epoch:0 0.172 0.321 0.167 0.065 0.195 0.250 0.188 0.307 0.324 0.147 0.366 0.440 1.3826 0.08
+epoch:1 0.223 0.395 0.225 0.092 0.249 0.322 0.222 0.354 0.372 0.186 0.413 0.499 1.0356 0.08
+epoch:2 0.235 0.408 0.241 0.100 0.258 0.350 0.230 0.372 0.392 0.204 0.429 0.517 0.9718 0.08
+epoch:3 0.246 0.426 0.252 0.103 0.267 0.357 0.241 0.386 0.408 0.225 0.448 0.521 0.9363 0.08
+epoch:4 0.250 0.424 0.257 0.106 0.272 0.367 0.242 0.381 0.400 0.210 0.438 0.530 0.9145 0.08
+epoch:5 0.255 0.434 0.262 0.109 0.279 0.375 0.242 0.379 0.398 0.209 0.433 0.534 0.8982 0.08
+epoch:6 0.270 0.456 0.283 0.120 0.293 0.392 0.254 0.403 0.421 0.229 0.462 0.551 0.8859 0.08
+epoch:7 0.269 0.455 0.280 0.118 0.296 0.388 0.257 0.402 0.421 0.228 0.454 0.564 0.8771 0.08
+epoch:8 0.276 0.465 0.290 0.120 0.301 0.398 0.255 0.401 0.418 0.227 0.461 0.553 0.8685 0.08
+epoch:9 0.271 0.458 0.282 0.113 0.297 0.404 0.253 0.398 0.417 0.211 0.460 0.570 0.8612 0.08
+epoch:10 0.277 0.463 0.289 0.119 0.299 0.410 0.258 0.405 0.425 0.221 0.466 0.558 0.8547 0.08
+epoch:11 0.276 0.463 0.287 0.122 0.304 0.405 0.259 0.406 0.425 0.236 0.466 0.559 0.8498 0.08
+epoch:12 0.276 0.464 0.288 0.127 0.294 0.409 0.257 0.406 0.425 0.236 0.459 0.563 0.8461 0.08
+epoch:13 0.284 0.477 0.296 0.124 0.311 0.412 0.262 0.407 0.429 0.229 0.474 0.555 0.8409 0.08
+epoch:14 0.277 0.464 0.292 0.121 0.304 0.397 0.257 0.410 0.431 0.238 0.473 0.565 0.8355 0.08
+epoch:15 0.282 0.474 0.296 0.121 0.308 0.413 0.264 0.411 0.432 0.231 0.473 0.575 0.833 0.08
+epoch:16 0.336 0.549 0.356 0.149 0.367 0.491 0.288 0.451 0.473 0.269 0.519 0.620 0.7421 0.008
+epoch:17 0.339 0.553 0.360 0.153 0.371 0.496 0.292 0.454 0.475 0.271 0.518 0.624 0.7157 0.008
+epoch:18 0.340 0.553 0.361 0.150 0.371 0.494 0.290 0.453 0.473 0.269 0.516 0.620 0.7016 0.008
+epoch:19 0.341 0.555 0.363 0.154 0.372 0.500 0.293 0.458 0.478 0.273 0.522 0.630 0.6897 0.008
+epoch:20 0.340 0.554 0.361 0.154 0.370 0.496 0.289 0.450 0.471 0.266 0.514 0.622 0.6802 0.008
+epoch:21 0.338 0.552 0.358 0.151 0.367 0.500 0.289 0.447 0.467 0.262 0.507 0.622 0.6708 0.008
+epoch:22 0.340 0.553 0.360 0.151 0.370 0.500 0.290 0.450 0.470 0.267 0.513 0.623 0.6497 0.0008
+epoch:23 0.340 0.552 0.361 0.151 0.369 0.500 0.290 0.449 0.468 0.266 0.509 0.619 0.6447 0.0008
+epoch:24 0.339 0.552 0.359 0.150 0.369 0.500 0.290 0.448 0.468 0.264 0.510 0.619 0.6421 0.0008
+epoch:25 0.338 0.551 0.359 0.152 0.367 0.500 0.289 0.448 0.467 0.264 0.509 0.618 0.6398 0.0008
diff --git a/pytorch_object_detection/mask_rcnn/train.py b/pytorch_object_detection/mask_rcnn/train.py
new file mode 100644
index 000000000..3f5179d61
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/train.py
@@ -0,0 +1,240 @@
+import os
+import datetime
+
+import torch
+from torchvision.ops.misc import FrozenBatchNorm2d
+
+import transforms
+from network_files import MaskRCNN
+from backbone import resnet50_fpn_backbone
+from my_dataset_coco import CocoDetection
+from my_dataset_voc import VOCInstances
+from train_utils import train_eval_utils as utils
+from train_utils import GroupedBatchSampler, create_aspect_ratio_groups
+
+
+def create_model(num_classes, load_pretrain_weights=True):
+ # if GPU memory is limited and batch_size cannot be large, it is recommended to set norm_layer to FrozenBatchNorm2d (default is nn.BatchNorm2d)
+ # FrozenBatchNorm2d behaves like BatchNorm2d, but its parameters cannot be updated
+ # trainable_layers covers ['layer4', 'layer3', 'layer2', 'layer1', 'conv1']; 5 means all of them are trained
+ # backbone = resnet50_fpn_backbone(norm_layer=FrozenBatchNorm2d,
+ # trainable_layers=3)
+ # resnet50 imagenet weights url: https://download.pytorch.org/models/resnet50-0676ba61.pth
+ backbone = resnet50_fpn_backbone(pretrain_path="resnet50.pth", trainable_layers=3)
+
+ model = MaskRCNN(backbone, num_classes=num_classes)
+
+ if load_pretrain_weights:
+ # coco weights url: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth"
+ weights_dict = torch.load("./maskrcnn_resnet50_fpn_coco.pth", map_location="cpu")
+ for k in list(weights_dict.keys()):
+ if ("box_predictor" in k) or ("mask_fcn_logits" in k):
+ del weights_dict[k]
+
+ print(model.load_state_dict(weights_dict, strict=False))
+
+ return model
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+ print("Using {} device training.".format(device.type))
+
+ # files used to save the coco evaluation info
+ now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+ det_results_file = f"det_results{now}.txt"
+ seg_results_file = f"seg_results{now}.txt"
+
+ data_transform = {
+ "train": transforms.Compose([transforms.ToTensor(),
+ transforms.RandomHorizontalFlip(0.5)]),
+ "val": transforms.Compose([transforms.ToTensor()])
+ }
+
+ data_root = args.data_path
+
+ # load train data set
+ # coco2017 -> annotations -> instances_train2017.json
+ train_dataset = CocoDetection(data_root, "train", data_transform["train"])
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
+ # train_dataset = VOCInstances(data_root, year="2012", txt_name="train.txt", transforms=data_transform["train"])
+ train_sampler = None
+
+ # whether to group images with similar aspect ratios into the same batch
+ # enabling this reduces the GPU memory needed for training; it is enabled by default
+ if args.aspect_ratio_group_factor >= 0:
+ train_sampler = torch.utils.data.RandomSampler(train_dataset)
+ # compute, for every image, the index of the aspect-ratio bin it falls into
+ group_ids = create_aspect_ratio_groups(train_dataset, k=args.aspect_ratio_group_factor)
+ # every batch draws its images from the same aspect-ratio bin
+ train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
+
+ # note that collate_fn is custom here: each sample contains an image and its targets, so the default collate cannot batch them
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
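+ # i.e. capped by the CPU count, the batch size and 8; falls back to 0 workers when batch_size is 1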
+ print('Using %g dataloader workers' % nw)
+
+ if train_sampler:
+ # when sampling by aspect ratio, the dataloader has to use batch_sampler instead of batch_size
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_sampler=train_batch_sampler,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+ else:
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ # load validation data set
+ # coco2017 -> annotations -> instances_val2017.json
+ val_dataset = CocoDetection(data_root, "val", data_transform["val"])
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
+ # val_dataset = VOCInstances(data_root, year="2012", txt_name="val.txt", transforms=data_transform["val"])
+ val_data_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=1,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ # create model; num_classes equals background + object classes
+ model = create_model(num_classes=args.num_classes + 1, load_pretrain_weights=args.pretrain)
+ model.to(device)
+
+ train_loss = []
+ learning_rate = []
+ val_map = []
+
+ # define optimizer
+ params = [p for p in model.parameters() if p.requires_grad]
+ optimizer = torch.optim.SGD(params, lr=args.lr,
+ momentum=args.momentum,
+ weight_decay=args.weight_decay)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+ # learning rate scheduler
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
+ milestones=args.lr_steps,
+ gamma=args.lr_gamma)
+ # if a resume path (the checkpoint saved by the previous run) is given, continue training from it
+ if args.resume:
+ # If map_location is missing, torch.load will first load the module to CPU
+ # and then copy each parameter to where it was saved,
+ # which would result in all processes on the same machine using the same set of devices.
+ checkpoint = torch.load(args.resume, map_location='cpu') # load the previously saved checkpoint (including optimizer and lr scheduler state)
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp and "scaler" in checkpoint:
+ scaler.load_state_dict(checkpoint["scaler"])
+
+ for epoch in range(args.start_epoch, args.epochs):
+ # train for one epoch, printing every 50 iterations
+ mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
+ device, epoch, print_freq=50,
+ warmup=True, scaler=scaler)
+ train_loss.append(mean_loss.item())
+ learning_rate.append(lr)
+
+ # update the learning rate
+ lr_scheduler.step()
+
+ # evaluate on the test dataset
+ det_info, seg_info = utils.evaluate(model, val_data_loader, device=device)
+
+ # write detection into txt
+ with open(det_results_file, "a") as f:
+ # each line contains the coco metrics followed by the mean loss and the learning rate
+ result_info = [f"{i:.4f}" for i in det_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ # write seg into txt
+ with open(seg_results_file, "a") as f:
+ # each line contains the coco metrics followed by the mean loss and the learning rate
+ result_info = [f"{i:.4f}" for i in seg_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ val_map.append(det_info[1]) # pascal mAP
+
+ # save weights
+ save_files = {
+ 'model': model.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'epoch': epoch}
+ if args.amp:
+ save_files["scaler"] = scaler.state_dict()
+ torch.save(save_files, "./save_weights/model_{}.pth".format(epoch))
+
+ # plot loss and lr curve
+ if len(train_loss) != 0 and len(learning_rate) != 0:
+ from plot_curve import plot_loss_and_lr
+ plot_loss_and_lr(train_loss, learning_rate)
+
+ # plot mAP curve
+ if len(val_map) != 0:
+ from plot_curve import plot_map
+ plot_map(val_map)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+ # training device
+ parser.add_argument('--device', default='cuda:0', help='device')
+ # root directory of the training dataset
+ parser.add_argument('--data-path', default='/data/coco2017', help='dataset')
+ # number of detection classes (excluding background)
+ parser.add_argument('--num-classes', default=90, type=int, help='num_classes')
+ # directory where output files are saved
+ parser.add_argument('--output-dir', default='./save_weights', help='path where to save')
+ # to resume training, pass the path of the checkpoint saved by the previous run
+ parser.add_argument('--resume', default='', type=str, help='resume from checkpoint')
+ # epoch to start training from
+ parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
+ # total number of training epochs
+ parser.add_argument('--epochs', default=26, type=int, metavar='N',
+ help='number of total epochs to run')
+ # learning rate
+ parser.add_argument('--lr', default=0.004, type=float,
+ help='initial learning rate, 0.02 is the default value for training '
+ 'on 8 gpus and 2 images_per_gpu')
+ # SGD momentum
+ parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+ help='momentum')
+ # SGD weight decay
+ parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+ metavar='W', help='weight decay (default: 1e-4)',
+ dest='weight_decay')
+ # parameters for torch.optim.lr_scheduler.MultiStepLR
+ parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int,
+ help='epochs at which to decrease the lr (MultiStepLR milestones)')
+ # parameters for torch.optim.lr_scheduler.MultiStepLR
+ parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
+ # training batch size (increase it if memory / GPU memory allows)
+ parser.add_argument('--batch_size', default=2, type=int, metavar='N',
+ help='batch size when training.')
+ parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
+ parser.add_argument("--pretrain", type=bool, default=True, help="load COCO pretrain weights.")
+ # whether to use mixed precision training (requires GPU support)
+ parser.add_argument("--amp", default=False, help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+ print(args)
+
+ # create the weights output directory if it does not exist
+ if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+
+ main(args)
diff --git a/pytorch_object_detection/mask_rcnn/train_multi_GPU.py b/pytorch_object_detection/mask_rcnn/train_multi_GPU.py
new file mode 100644
index 000000000..05647edef
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/train_multi_GPU.py
@@ -0,0 +1,268 @@
+import time
+import os
+import datetime
+
+import torch
+from torchvision.ops.misc import FrozenBatchNorm2d
+
+import transforms
+from my_dataset_coco import CocoDetection
+from my_dataset_voc import VOCInstances
+from backbone import resnet50_fpn_backbone
+from network_files import MaskRCNN
+import train_utils.train_eval_utils as utils
+from train_utils import GroupedBatchSampler, create_aspect_ratio_groups, init_distributed_mode, save_on_master, mkdir
+
+
+def create_model(num_classes, load_pretrain_weights=True):
+ # if GPU memory is limited and batch_size cannot be large, it is recommended to set norm_layer to FrozenBatchNorm2d (default is nn.BatchNorm2d)
+ # FrozenBatchNorm2d behaves like BatchNorm2d, but its parameters cannot be updated
+ # trainable_layers covers ['layer4', 'layer3', 'layer2', 'layer1', 'conv1']; 5 means all of them are trained
+ # backbone = resnet50_fpn_backbone(norm_layer=FrozenBatchNorm2d,
+ # trainable_layers=3)
+ # resnet50 imagenet weights url: https://download.pytorch.org/models/resnet50-0676ba61.pth
+ backbone = resnet50_fpn_backbone(pretrain_path="resnet50.pth", trainable_layers=3)
+ model = MaskRCNN(backbone, num_classes=num_classes)
+
+ if load_pretrain_weights:
+ # coco weights url: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth"
+ weights_dict = torch.load("./maskrcnn_resnet50_fpn_coco.pth", map_location="cpu")
+ for k in list(weights_dict.keys()):
+ if ("box_predictor" in k) or ("mask_fcn_logits" in k):
+ del weights_dict[k]
+
+ print(model.load_state_dict(weights_dict, strict=False))
+
+ return model
+
+
+def main(args):
+ init_distributed_mode(args)
+ print(args)
+
+ device = torch.device(args.device)
+
+ # files used to save the coco evaluation info
+ now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+ det_results_file = f"det_results{now}.txt"
+ seg_results_file = f"seg_results{now}.txt"
+
+ # Data loading code
+ print("Loading data")
+
+ data_transform = {
+ "train": transforms.Compose([transforms.ToTensor(),
+ transforms.RandomHorizontalFlip(0.5)]),
+ "val": transforms.Compose([transforms.ToTensor()])
+ }
+
+ COCO_root = args.data_path
+
+ # load train data set
+ # coco2017 -> annotations -> instances_train2017.json
+ train_dataset = CocoDetection(COCO_root, "train", data_transform["train"])
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
+ # train_dataset = VOCInstances(data_root, year="2012", txt_name="train.txt")
+
+ # load validation data set
+ # coco2017 -> annotations -> instances_val2017.json
+ val_dataset = CocoDetection(COCO_root, "val", data_transform["val"])
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
+ # val_dataset = VOCInstances(data_root, year="2012", txt_name="val.txt")
+
+ print("Creating data loaders")
+ if args.distributed:
+ train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+ test_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
+ else:
+ train_sampler = torch.utils.data.RandomSampler(train_dataset)
+ test_sampler = torch.utils.data.SequentialSampler(val_dataset)
+
+ if args.aspect_ratio_group_factor >= 0:
+ # compute, for every image, the index of the aspect-ratio bin it falls into
+ group_ids = create_aspect_ratio_groups(train_dataset, k=args.aspect_ratio_group_factor)
+ train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
+ else:
+ train_batch_sampler = torch.utils.data.BatchSampler(
+ train_sampler, args.batch_size, drop_last=True)
+
+ data_loader = torch.utils.data.DataLoader(
+ train_dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
+ collate_fn=train_dataset.collate_fn)
+
+ data_loader_test = torch.utils.data.DataLoader(
+ val_dataset, batch_size=1,
+ sampler=test_sampler, num_workers=args.workers,
+ collate_fn=train_dataset.collate_fn)
+
+ print("Creating model")
+ # create model; num_classes equals background + object classes
+ model = create_model(num_classes=args.num_classes + 1, load_pretrain_weights=args.pretrain)
+ model.to(device)
+
+ if args.distributed and args.sync_bn:
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
+ model_without_ddp = model
+ if args.distributed:
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
+ model_without_ddp = model.module
+
+ params = [p for p in model.parameters() if p.requires_grad]
+ optimizer = torch.optim.SGD(
+ params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+ # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
+
+ # if a resume path (the checkpoint saved by the previous run) is given, continue training from it
+ if args.resume:
+ # If map_location is missing, torch.load will first load the module to CPU
+ # and then copy each parameter to where it was saved,
+ # which would result in all processes on the same machine using the same set of devices.
+ checkpoint = torch.load(args.resume, map_location='cpu') # load the previously saved checkpoint (including optimizer and lr scheduler state)
+ model_without_ddp.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp and "scaler" in checkpoint:
+ scaler.load_state_dict(checkpoint["scaler"])
+
+ if args.test_only:
+ utils.evaluate(model, data_loader_test, device=device)
+ return
+
+ train_loss = []
+ learning_rate = []
+ val_map = []
+
+ print("Start training")
+ start_time = time.time()
+ for epoch in range(args.start_epoch, args.epochs):
+ if args.distributed:
+ train_sampler.set_epoch(epoch)
+ mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader,
+ device, epoch, args.print_freq,
+ warmup=True, scaler=scaler)
+
+ # update learning rate
+ lr_scheduler.step()
+
+ # evaluate after every epoch
+ det_info, seg_info = utils.evaluate(model, data_loader_test, device=device)
+
+ # only the main process writes the results
+ if args.rank in [-1, 0]:
+ train_loss.append(mean_loss.item())
+ learning_rate.append(lr)
+ val_map.append(det_info[1]) # pascal mAP
+
+ # write into txt
+ with open(det_results_file, "a") as f:
+ # each line contains the coco metrics followed by the mean loss and the learning rate
+ result_info = [f"{i:.4f}" for i in det_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ with open(seg_results_file, "a") as f:
+ # each line contains the coco metrics followed by the mean loss and the learning rate
+ result_info = [f"{i:.4f}" for i in seg_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ if args.output_dir:
+ # only the main process saves the weights
+ save_files = {'model': model_without_ddp.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'args': args,
+ 'epoch': epoch}
+ if args.amp:
+ save_files["scaler"] = scaler.state_dict()
+ save_on_master(save_files,
+ os.path.join(args.output_dir, f'model_{epoch}.pth'))
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('Training time {}'.format(total_time_str))
+
+ if args.rank in [-1, 0]:
+ # plot loss and lr curve
+ if len(train_loss) != 0 and len(learning_rate) != 0:
+ from plot_curve import plot_loss_and_lr
+ plot_loss_and_lr(train_loss, learning_rate)
+
+ # plot mAP curve
+ if len(val_map) != 0:
+ from plot_curve import plot_map
+ plot_map(val_map)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+ # root directory of the dataset (coco2017)
+ parser.add_argument('--data-path', default='/data/coco2017', help='dataset')
+ # training device
+ parser.add_argument('--device', default='cuda', help='device')
+ # number of detection classes (excluding background)
+ parser.add_argument('--num-classes', default=90, type=int, help='num_classes')
+ # batch_size per GPU
+ parser.add_argument('-b', '--batch-size', default=4, type=int,
+ help='images per gpu, the total batch size is $NGPU x batch_size')
+ # epoch to start training from
+ parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
+ # total number of training epochs
+ parser.add_argument('--epochs', default=26, type=int, metavar='N',
+ help='number of total epochs to run')
+ # number of data loading / preprocessing workers
+ parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+ help='number of data loading workers (default: 4)')
+ # learning rate; scale it with the number of GPUs and the batch size: 0.02 / bs * num_GPU
+ parser.add_argument('--lr', default=0.005, type=float,
+ help='initial learning rate, 0.02 is the default value for training '
+ 'on 8 gpus and 2 images_per_gpu')
+ # SGD momentum
+ parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+ help='momentum')
+ # SGD weight decay
+ parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+ metavar='W', help='weight decay (default: 1e-4)',
+ dest='weight_decay')
+ # parameter for torch.optim.lr_scheduler.StepLR
+ parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs')
+ # parameters for torch.optim.lr_scheduler.MultiStepLR
+ parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int,
+ help='epochs at which to decrease the lr (MultiStepLR milestones)')
+ # parameters for torch.optim.lr_scheduler.MultiStepLR
+ parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
+ # how often training info is printed
+ parser.add_argument('--print-freq', default=50, type=int, help='print frequency')
+ # directory where output files are saved
+ parser.add_argument('--output-dir', default='./multi_train', help='path where to save')
+ # resume training from a previous checkpoint
+ parser.add_argument('--resume', default='', help='resume from checkpoint')
+ parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
+ parser.add_argument('--test-only', action="http://github.com/store_true", help="test only")
+
+ # number of distributed processes (not threads)
+ parser.add_argument('--world-size', default=4, type=int,
+ help='number of distributed processes')
+ parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
+ parser.add_argument("--sync-bn", dest="sync_bn", help="Use sync batch norm", type=bool, default=False)
+ parser.add_argument("--pretrain", type=bool, default=True, help="load COCO pretrain weights.")
+ # whether to use mixed precision training (requires GPU support)
+ parser.add_argument("--amp", default=False, help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+
+ # if an output directory is specified, create it if it does not exist
+ if args.output_dir:
+ mkdir(args.output_dir)
+
+ main(args)
diff --git a/pytorch_object_detection/mask_rcnn/train_utils/__init__.py b/pytorch_object_detection/mask_rcnn/train_utils/__init__.py
new file mode 100644
index 000000000..3dfa7eadc
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/train_utils/__init__.py
@@ -0,0 +1,4 @@
+from .group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
+from .distributed_utils import init_distributed_mode, save_on_master, mkdir
+from .coco_eval import EvalCOCOMetric
+from .coco_utils import coco_remove_images_without_annotations, convert_coco_poly_mask, convert_to_coco_api
diff --git a/pytorch_object_detection/mask_rcnn/train_utils/coco_eval.py b/pytorch_object_detection/mask_rcnn/train_utils/coco_eval.py
new file mode 100644
index 000000000..b8df0204d
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/train_utils/coco_eval.py
@@ -0,0 +1,163 @@
+import json
+import copy
+
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import pycocotools.mask as mask_util
+from .distributed_utils import all_gather, is_main_process
+
+
+def merge(img_ids, eval_results):
+ """将多个进程之间的数据汇总在一起"""
+ all_img_ids = all_gather(img_ids)
+ all_eval_results = all_gather(eval_results)
+
+ merged_img_ids = []
+ for p in all_img_ids:
+ merged_img_ids.extend(p)
+
+ merged_eval_results = []
+ for p in all_eval_results:
+ merged_eval_results.extend(p)
+
+ merged_img_ids = np.array(merged_img_ids)
+
+ # keep only unique (and in sorted order) images
+ # remove duplicate image ids: with multi-GPU training an image may be given to several processes so that every process sees the same number of images
+ merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
+ merged_eval_results = [merged_eval_results[i] for i in idx]
+
+ return list(merged_img_ids), merged_eval_results
+
+
+class EvalCOCOMetric:
+ def __init__(self,
+ coco: COCO = None,
+ iou_type: str = None,
+ results_file_name: str = "predict_results.json",
+ classes_mapping: dict = None):
+ self.coco = copy.deepcopy(coco)
+ self.img_ids = [] # ids of the images handled by this process
+ self.results = []
+ self.aggregation_results = None
+ self.classes_mapping = classes_mapping
+ self.coco_evaluator = None
+ assert iou_type in ["bbox", "segm", "keypoints"]
+ self.iou_type = iou_type
+ self.results_file_name = results_file_name
+
+ def prepare_for_coco_detection(self, targets, outputs):
+ """将预测的结果转换成COCOeval指定的格式,针对目标检测任务"""
+ # 遍历每张图像的预测结果
+ for target, output in zip(targets, outputs):
+ if len(output) == 0:
+ continue
+
+ img_id = int(target["image_id"])
+ if img_id in self.img_ids:
+ # skip duplicated images
+ continue
+ self.img_ids.append(img_id)
+ per_image_boxes = output["boxes"]
+ # coco_eval expects each box in the format [x_min, y_min, w, h]
+ # our predicted boxes are [x_min, y_min, x_max, y_max], so convert the format
+ per_image_boxes[:, 2:] -= per_image_boxes[:, :2]
+ per_image_classes = output["labels"].tolist()
+ per_image_scores = output["scores"].tolist()
+
+ res_list = []
+ # iterate over every detected object
+ for object_score, object_class, object_box in zip(
+ per_image_scores, per_image_classes, per_image_boxes):
+ object_score = float(object_score)
+ class_idx = int(object_class)
+ if self.classes_mapping is not None:
+ class_idx = int(self.classes_mapping[str(class_idx)])
+ # We recommend rounding coordinates to the nearest tenth of a pixel
+ # to reduce resulting JSON file size.
+ object_box = [round(b, 2) for b in object_box.tolist()]
+
+ res = {"image_id": img_id,
+ "category_id": class_idx,
+ "bbox": object_box,
+ "score": round(object_score, 3)}
+ res_list.append(res)
+ self.results.append(res_list)
+
+ def prepare_for_coco_segmentation(self, targets, outputs):
+ """将预测的结果转换成COCOeval指定的格式,针对实例分割任务"""
+ # 遍历每张图像的预测结果
+ for target, output in zip(targets, outputs):
+ if len(output) == 0:
+ continue
+
+ img_id = int(target["image_id"])
+ if img_id in self.img_ids:
+ # skip duplicated images
+ continue
+
+ self.img_ids.append(img_id)
+ per_image_masks = output["masks"]
+ per_image_classes = output["labels"].tolist()
+ per_image_scores = output["scores"].tolist()
+
+ masks = per_image_masks > 0.5
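+ # binarize the soft masks at a 0.5 threshold before RLE encoding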
+
+ res_list = []
+ # iterate over every detected object
+ for mask, label, score in zip(masks, per_image_classes, per_image_scores):
+ rle = mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
+ rle["counts"] = rle["counts"].decode("utf-8")
+
+ class_idx = int(label)
+ if self.classes_mapping is not None:
+ class_idx = int(self.classes_mapping[str(class_idx)])
+
+ res = {"image_id": img_id,
+ "category_id": class_idx,
+ "segmentation": rle,
+ "score": round(score, 3)}
+ res_list.append(res)
+ self.results.append(res_list)
+
+ def update(self, targets, outputs):
+ if self.iou_type == "bbox":
+ self.prepare_for_coco_detection(targets, outputs)
+ elif self.iou_type == "segm":
+ self.prepare_for_coco_segmentation(targets, outputs)
+ else:
+ raise KeyError(f"not support iou_type: {self.iou_type}")
+
+ def synchronize_results(self):
+ # synchronize the results across all processes
+ eval_ids, eval_results = merge(self.img_ids, self.results)
+ self.aggregation_results = {"img_ids": eval_ids, "results": eval_results}
+
+ # only the main process needs to save the results
+ if is_main_process():
+ results = []
+ [results.extend(i) for i in eval_results]
+ # write predict results into json file
+ json_str = json.dumps(results, indent=4)
+ with open(self.results_file_name, 'w') as json_file:
+ json_file.write(json_str)
+
+ def evaluate(self):
+ # only the main process runs the evaluation
+ if is_main_process():
+ # accumulate predictions from all images
+ coco_true = self.coco
+ coco_pre = coco_true.loadRes(self.results_file_name)
+
+ self.coco_evaluator = COCOeval(cocoGt=coco_true, cocoDt=coco_pre, iouType=self.iou_type)
+
+ self.coco_evaluator.evaluate()
+ self.coco_evaluator.accumulate()
+ print(f"IoU metric: {self.iou_type}")
+ self.coco_evaluator.summarize()
+
+ coco_info = self.coco_evaluator.stats.tolist() # numpy to list
+ return coco_info
+ else:
+ return None
diff --git a/pytorch_object_detection/mask_rcnn/train_utils/coco_utils.py b/pytorch_object_detection/mask_rcnn/train_utils/coco_utils.py
new file mode 100644
index 000000000..7a3b3122e
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/train_utils/coco_utils.py
@@ -0,0 +1,98 @@
+import torch
+import torch.utils.data
+from pycocotools import mask as coco_mask
+from pycocotools.coco import COCO
+
+
+def coco_remove_images_without_annotations(dataset, ids):
+ """
+ Remove images that contain no objects, or whose objects all have a very small area, from the coco dataset.
+ refer to:
+ https://github.com/pytorch/vision/blob/master/references/detection/coco_utils.py
+ :param dataset: COCO api object
+ :param ids: image ids to filter
+ :return: the ids of images that have at least one valid annotation
+ """
+ def _has_only_empty_bbox(anno):
+ return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
+
+ def _has_valid_annotation(anno):
+ # if it's empty, there is no annotation
+ if len(anno) == 0:
+ return False
+ # if all boxes have close to zero area, there is no annotation
+ if _has_only_empty_bbox(anno):
+ return False
+
+ return True
+
+ valid_ids = []
+ for ds_idx, img_id in enumerate(ids):
+ ann_ids = dataset.getAnnIds(imgIds=img_id, iscrowd=None)
+ anno = dataset.loadAnns(ann_ids)
+
+ if _has_valid_annotation(anno):
+ valid_ids.append(img_id)
+
+ return valid_ids
+
+
+def convert_coco_poly_mask(segmentations, height, width):
+ masks = []
+ for polygons in segmentations:
+ rles = coco_mask.frPyObjects(polygons, height, width)
+ mask = coco_mask.decode(rles)
+ if len(mask.shape) < 3:
+ mask = mask[..., None]
+ mask = torch.as_tensor(mask, dtype=torch.uint8)
+ mask = mask.any(dim=2)
+ masks.append(mask)
+ if masks:
+ masks = torch.stack(masks, dim=0)
+ else:
+ # if masks is empty there are no objects, so return an all-zero mask tensor
+ masks = torch.zeros((0, height, width), dtype=torch.uint8)
+ return masks
+
+
+def convert_to_coco_api(self):
+ coco_ds = COCO()
+ # annotation IDs need to start at 1, not 0, see torchvision issue #1530
+ ann_id = 1
+ dataset = {"images": [], "categories": [], "annotations": []}
+ categories = set()
+ for img_idx in range(len(self)):
+ targets, h, w = self.get_annotations(img_idx)
+ img_id = targets["image_id"].item()
+ img_dict = {"id": img_id,
+ "height": h,
+ "width": w}
+ dataset["images"].append(img_dict)
+ bboxes = targets["boxes"].clone()
+ # convert (x_min, ymin, xmax, ymax) to (xmin, ymin, w, h)
+ bboxes[:, 2:] -= bboxes[:, :2]
+ bboxes = bboxes.tolist()
+ labels = targets["labels"].tolist()
+ areas = targets["area"].tolist()
+ iscrowd = targets["iscrowd"].tolist()
+ if "masks" in targets:
+ masks = targets["masks"]
+ # make masks Fortran contiguous for coco_mask
+ masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
+ num_objs = len(bboxes)
+ for i in range(num_objs):
+ ann = {"image_id": img_id,
+ "bbox": bboxes[i],
+ "category_id": labels[i],
+ "area": areas[i],
+ "iscrowd": iscrowd[i],
+ "id": ann_id}
+ categories.add(labels[i])
+ if "masks" in targets:
+ ann["segmentation"] = coco_mask.encode(masks[i].numpy())
+ dataset["annotations"].append(ann)
+ ann_id += 1
+ dataset["categories"] = [{"id": i} for i in sorted(categories)]
+ coco_ds.dataset = dataset
+ coco_ds.createIndex()
+ return coco_ds
diff --git a/pytorch_object_detection/mask_rcnn/train_utils/distributed_utils.py b/pytorch_object_detection/mask_rcnn/train_utils/distributed_utils.py
new file mode 100644
index 000000000..80b2412c6
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/train_utils/distributed_utils.py
@@ -0,0 +1,299 @@
+from collections import defaultdict, deque
+import datetime
+import pickle
+import time
+import errno
+import os
+
+import torch
+import torch.distributed as dist
+
+
+class SmoothedValue(object):
+ """Track a series of values and provide access to smoothed values over a
+ window or the global series average.
+ """
+ def __init__(self, window_size=20, fmt=None):
+ if fmt is None:
+ fmt = "{value:.4f} ({global_avg:.4f})"
+ self.deque = deque(maxlen=window_size) # deque can be thought of as an enhanced list
+ self.total = 0.0
+ self.count = 0
+ self.fmt = fmt
+
+ def update(self, value, n=1):
+ self.deque.append(value)
+ self.count += n
+ self.total += value * n
+
+ def synchronize_between_processes(self):
+ """
+ Warning: does not synchronize the deque!
+ """
+ if not is_dist_avail_and_initialized():
+ return
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+ dist.barrier()
+ dist.all_reduce(t)
+ t = t.tolist()
+ self.count = int(t[0])
+ self.total = t[1]
+
+ @property
+ def median(self): # @property exposes median as a read-only attribute
+ d = torch.tensor(list(self.deque))
+ return d.median().item()
+
+ @property
+ def avg(self):
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
+ return d.mean().item()
+
+ @property
+ def global_avg(self):
+ return self.total / self.count
+
+ @property
+ def max(self):
+ return max(self.deque)
+
+ @property
+ def value(self):
+ return self.deque[-1]
+
+ def __str__(self):
+ return self.fmt.format(
+ median=self.median,
+ avg=self.avg,
+ global_avg=self.global_avg,
+ max=self.max,
+ value=self.value)
+
+
+def all_gather(data):
+ """
+ Gather data from all processes.
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
+ Args:
+ data: any picklable object
+ Returns:
+ list[data]: list of data gathered from each rank
+ """
+ world_size = get_world_size() # number of processes
+ if world_size == 1:
+ return [data]
+
+ data_list = [None] * world_size
+ dist.all_gather_object(data_list, data)
+
+ return data_list
+
+
+def reduce_dict(input_dict, average=True):
+ """
+ Args:
+ input_dict (dict): all the values will be reduced
+ average (bool): whether to do average or sum
+ Reduce the values in the dictionary from all processes so that all processes
+ have the averaged results. Returns a dict with the same fields as
+ input_dict, after reduction.
+ """
+ world_size = get_world_size()
+ if world_size < 2: # single-GPU case
+ return input_dict
+ with torch.no_grad(): # multi-GPU case
+ names = []
+ values = []
+ # sort the keys so that they are consistent across processes
+ for k in sorted(input_dict.keys()):
+ names.append(k)
+ values.append(input_dict[k])
+ values = torch.stack(values, dim=0)
+ dist.all_reduce(values)
+ if average:
+ values /= world_size
+
+ reduced_dict = {k: v for k, v in zip(names, values)}
+ return reduced_dict
+
+
+class MetricLogger(object):
+ def __init__(self, delimiter="\t"):
+ self.meters = defaultdict(SmoothedValue)
+ self.delimiter = delimiter
+
+ def update(self, **kwargs):
+ for k, v in kwargs.items():
+ if isinstance(v, torch.Tensor):
+ v = v.item()
+ assert isinstance(v, (float, int))
+ self.meters[k].update(v)
+
+ def __getattr__(self, attr):
+ if attr in self.meters:
+ return self.meters[attr]
+ if attr in self.__dict__:
+ return self.__dict__[attr]
+ raise AttributeError("'{}' object has no attribute '{}'".format(
+ type(self).__name__, attr))
+
+ def __str__(self):
+ loss_str = []
+ for name, meter in self.meters.items():
+ loss_str.append(
+ "{}: {}".format(name, str(meter))
+ )
+ return self.delimiter.join(loss_str)
+
+ def synchronize_between_processes(self):
+ for meter in self.meters.values():
+ meter.synchronize_between_processes()
+
+ def add_meter(self, name, meter):
+ self.meters[name] = meter
+
+ def log_every(self, iterable, print_freq, header=None):
+ i = 0
+ if not header:
+ header = ""
+ start_time = time.time()
+ end = time.time()
+ iter_time = SmoothedValue(fmt='{avg:.4f}')
+ data_time = SmoothedValue(fmt='{avg:.4f}')
+ space_fmt = ":" + str(len(str(len(iterable)))) + "d"
+ if torch.cuda.is_available():
+ log_msg = self.delimiter.join([header,
+ '[{0' + space_fmt + '}/{1}]',
+ 'eta: {eta}',
+ '{meters}',
+ 'time: {time}',
+ 'data: {data}',
+ 'max mem: {memory:.0f}'])
+ else:
+ log_msg = self.delimiter.join([header,
+ '[{0' + space_fmt + '}/{1}]',
+ 'eta: {eta}',
+ '{meters}',
+ 'time: {time}',
+ 'data: {data}'])
+ MB = 1024.0 * 1024.0
+ for obj in iterable:
+ data_time.update(time.time() - end)
+ yield obj
+ iter_time.update(time.time() - end)
+ if i % print_freq == 0 or i == len(iterable) - 1:
+ eta_second = iter_time.global_avg * (len(iterable) - i)
+ eta_string = str(datetime.timedelta(seconds=eta_second))
+ if torch.cuda.is_available():
+ print(log_msg.format(i, len(iterable),
+ eta=eta_string,
+ meters=str(self),
+ time=str(iter_time),
+ data=str(data_time),
+ memory=torch.cuda.max_memory_allocated() / MB))
+ else:
+ print(log_msg.format(i, len(iterable),
+ eta=eta_string,
+ meters=str(self),
+ time=str(iter_time),
+ data=str(data_time)))
+ i += 1
+ end = time.time()
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('{} Total time: {} ({:.4f} s / it)'.format(header,
+                                                          total_time_str,
+                                                          total_time / len(iterable)))
+
+
+def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
+
+ def f(x):
+        """Return a learning-rate multiplier for the given step."""
+        if x >= warmup_iters:  # once past warmup_iters the multiplier stays at 1
+            return 1
+        alpha = float(x) / warmup_iters
+        # the multiplier grows linearly from warmup_factor to 1 during warmup
+        return warmup_factor * (1 - alpha) + alpha
+
+ return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
+
+
+def mkdir(path):
+ try:
+ os.makedirs(path)
+ except OSError as e:
+ if e.errno != errno.EEXIST:
+ raise
+
+
+def setup_for_distributed(is_master):
+ """
+    This function disables printing when not in the master process
+ """
+ import builtins as __builtin__
+ builtin_print = __builtin__.print
+
+ def print(*args, **kwargs):
+ force = kwargs.pop('force', False)
+ if is_master or force:
+ builtin_print(*args, **kwargs)
+
+ __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+    """Check whether torch.distributed is available and initialized."""
+ if not dist.is_available():
+ return False
+ if not dist.is_initialized():
+ return False
+ return True
+
+
+def get_world_size():
+ if not is_dist_avail_and_initialized():
+ return 1
+ return dist.get_world_size()
+
+
+def get_rank():
+ if not is_dist_avail_and_initialized():
+ return 0
+ return dist.get_rank()
+
+
+def is_main_process():
+ return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):
+ if is_main_process():
+ torch.save(*args, **kwargs)
+
+
+def init_distributed_mode(args):
+ if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+ args.rank = int(os.environ["RANK"])
+ args.world_size = int(os.environ['WORLD_SIZE'])
+ args.gpu = int(os.environ['LOCAL_RANK'])
+ elif 'SLURM_PROCID' in os.environ:
+ args.rank = int(os.environ['SLURM_PROCID'])
+ args.gpu = args.rank % torch.cuda.device_count()
+ else:
+ print('Not using distributed mode')
+ args.distributed = False
+ return
+
+ args.distributed = True
+
+ torch.cuda.set_device(args.gpu)
+ args.dist_backend = 'nccl'
+ print('| distributed init (rank {}): {}'.format(
+ args.rank, args.dist_url), flush=True)
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+ world_size=args.world_size, rank=args.rank)
+ torch.distributed.barrier()
+ setup_for_distributed(args.rank == 0)
+
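For orientation, the sketch below shows how `SmoothedValue` and `MetricLogger` from this file are meant to be driven inside a loop. This is a minimal sketch, not part of the repository: the loop body and loss values are placeholders.

```python
# Minimal usage sketch for MetricLogger / SmoothedValue (placeholder values only).
from train_utils import distributed_utils as utils

logger = utils.MetricLogger(delimiter="  ")
logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))

for step, _ in enumerate(logger.log_every(range(100), print_freq=10, header="Epoch: [0]")):
    fake_loss = 1.0 / (step + 1)          # stand-in for a real loss value
    logger.update(loss=fake_loss, lr=0.01)

# every logged quantity is available afterwards as a SmoothedValue meter
print(logger.loss.global_avg)
```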
diff --git a/pytorch_object_detection/mask_rcnn/train_utils/group_by_aspect_ratio.py b/pytorch_object_detection/mask_rcnn/train_utils/group_by_aspect_ratio.py
new file mode 100644
index 000000000..e7b8b9e88
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/train_utils/group_by_aspect_ratio.py
@@ -0,0 +1,201 @@
+import bisect
+from collections import defaultdict
+import copy
+from itertools import repeat, chain
+import math
+import numpy as np
+
+import torch
+import torch.utils.data
+from torch.utils.data.sampler import BatchSampler, Sampler
+from torch.utils.model_zoo import tqdm
+import torchvision
+
+from PIL import Image
+
+
+def _repeat_to_at_least(iterable, n):
+ repeat_times = math.ceil(n / len(iterable))
+ repeated = chain.from_iterable(repeat(iterable, repeat_times))
+ return list(repeated)
+
+
+class GroupedBatchSampler(BatchSampler):
+ """
+ Wraps another sampler to yield a mini-batch of indices.
+ It enforces that the batch only contain elements from the same group.
+    It also tries to provide mini-batches which follow an ordering that is
+    as close as possible to the ordering from the original sampler.
+ Arguments:
+ sampler (Sampler): Base sampler.
+ group_ids (list[int]): If the sampler produces indices in range [0, N),
+ `group_ids` must be a list of `N` ints which contains the group id of each sample.
+ The group ids must be a continuous set of integers starting from
+ 0, i.e. they must be in the range [0, num_groups).
+ batch_size (int): Size of mini-batch.
+ """
+ def __init__(self, sampler, group_ids, batch_size):
+ if not isinstance(sampler, Sampler):
+ raise ValueError(
+ "sampler should be an instance of "
+ "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+ )
+ self.sampler = sampler
+ self.group_ids = group_ids
+ self.batch_size = batch_size
+
+ def __iter__(self):
+ buffer_per_group = defaultdict(list)
+ samples_per_group = defaultdict(list)
+
+ num_batches = 0
+ for idx in self.sampler:
+ group_id = self.group_ids[idx]
+ buffer_per_group[group_id].append(idx)
+ samples_per_group[group_id].append(idx)
+ if len(buffer_per_group[group_id]) == self.batch_size:
+ yield buffer_per_group[group_id]
+ num_batches += 1
+ del buffer_per_group[group_id]
+ assert len(buffer_per_group[group_id]) < self.batch_size
+
+ # now we have run out of elements that satisfy
+ # the group criteria, let's return the remaining
+ # elements so that the size of the sampler is
+ # deterministic
+ expected_num_batches = len(self)
+ num_remaining = expected_num_batches - num_batches
+ if num_remaining > 0:
+ # for the remaining batches, take first the buffers with largest number
+ # of elements
+ for group_id, _ in sorted(buffer_per_group.items(),
+ key=lambda x: len(x[1]), reverse=True):
+ remaining = self.batch_size - len(buffer_per_group[group_id])
+ samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining)
+ buffer_per_group[group_id].extend(samples_from_group_id[:remaining])
+ assert len(buffer_per_group[group_id]) == self.batch_size
+ yield buffer_per_group[group_id]
+ num_remaining -= 1
+ if num_remaining == 0:
+ break
+ assert num_remaining == 0
+
+ def __len__(self):
+ return len(self.sampler) // self.batch_size
+
+
+def _compute_aspect_ratios_slow(dataset, indices=None):
+ print("Your dataset doesn't support the fast path for "
+ "computing the aspect ratios, so will iterate over "
+ "the full dataset and load every image instead. "
+ "This might take some time...")
+ if indices is None:
+ indices = range(len(dataset))
+
+ class SubsetSampler(Sampler):
+ def __init__(self, indices):
+ self.indices = indices
+
+ def __iter__(self):
+ return iter(self.indices)
+
+ def __len__(self):
+ return len(self.indices)
+
+ sampler = SubsetSampler(indices)
+ data_loader = torch.utils.data.DataLoader(
+ dataset, batch_size=1, sampler=sampler,
+ num_workers=14, # you might want to increase it for faster processing
+ collate_fn=lambda x: x[0])
+ aspect_ratios = []
+ with tqdm(total=len(dataset)) as pbar:
+ for _i, (img, _) in enumerate(data_loader):
+ pbar.update(1)
+ height, width = img.shape[-2:]
+ aspect_ratio = float(width) / float(height)
+ aspect_ratios.append(aspect_ratio)
+ return aspect_ratios
+
+
+def _compute_aspect_ratios_custom_dataset(dataset, indices=None):
+ if indices is None:
+ indices = range(len(dataset))
+ aspect_ratios = []
+ for i in indices:
+ height, width = dataset.get_height_and_width(i)
+ aspect_ratio = float(width) / float(height)
+ aspect_ratios.append(aspect_ratio)
+ return aspect_ratios
+
+
+def _compute_aspect_ratios_coco_dataset(dataset, indices=None):
+ if indices is None:
+ indices = range(len(dataset))
+ aspect_ratios = []
+ for i in indices:
+ img_info = dataset.coco.imgs[dataset.ids[i]]
+ aspect_ratio = float(img_info["width"]) / float(img_info["height"])
+ aspect_ratios.append(aspect_ratio)
+ return aspect_ratios
+
+
+def _compute_aspect_ratios_voc_dataset(dataset, indices=None):
+ if indices is None:
+ indices = range(len(dataset))
+ aspect_ratios = []
+ for i in indices:
+ # this doesn't load the data into memory, because PIL loads it lazily
+ width, height = Image.open(dataset.images[i]).size
+ aspect_ratio = float(width) / float(height)
+ aspect_ratios.append(aspect_ratio)
+ return aspect_ratios
+
+
+def _compute_aspect_ratios_subset_dataset(dataset, indices=None):
+ if indices is None:
+ indices = range(len(dataset))
+
+ ds_indices = [dataset.indices[i] for i in indices]
+ return compute_aspect_ratios(dataset.dataset, ds_indices)
+
+
+def compute_aspect_ratios(dataset, indices=None):
+ if hasattr(dataset, "get_height_and_width"):
+ return _compute_aspect_ratios_custom_dataset(dataset, indices)
+
+ if isinstance(dataset, torchvision.datasets.CocoDetection):
+ return _compute_aspect_ratios_coco_dataset(dataset, indices)
+
+ if isinstance(dataset, torchvision.datasets.VOCDetection):
+ return _compute_aspect_ratios_voc_dataset(dataset, indices)
+
+ if isinstance(dataset, torch.utils.data.Subset):
+ return _compute_aspect_ratios_subset_dataset(dataset, indices)
+
+ # slow path
+ return _compute_aspect_ratios_slow(dataset, indices)
+
+
+def _quantize(x, bins):
+ bins = copy.deepcopy(bins)
+ bins = sorted(bins)
+    # bisect_right: return the index where y would be inserted into the sorted bins (to the right of any equal values)
+ quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
+ return quantized
+
+
+def create_aspect_ratio_groups(dataset, k=0):
+    # compute the width/height ratio of every image in the dataset
+    aspect_ratios = compute_aspect_ratios(dataset)
+    # split the [0.5, 2] interval into 2*k+1 bins
+    bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0]
+
+    # map each image's aspect ratio to the index of its bin
+    groups = _quantize(aspect_ratios, bins)
+    # count the number of elements per group
+ counts = np.unique(groups, return_counts=True)[1]
+ fbins = [0] + bins + [np.inf]
+ print("Using {} as bins for aspect ratio quantization".format(fbins))
+ print("Count of instances per bin: {}".format(counts))
+ return groups
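As a usage sketch (the train scripts later in this diff wire these pieces up the same way), the aspect-ratio groups feed a `GroupedBatchSampler` that replaces the plain batch sampler; `train_dataset` below is assumed to be one of this repo's dataset classes, which provide a `collate_fn`.

```python
# Sketch: batch images of similar aspect ratio together (assumes train_dataset exists).
import torch
from train_utils.group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups

train_sampler = torch.utils.data.RandomSampler(train_dataset)
group_ids = create_aspect_ratio_groups(train_dataset, k=3)   # 2*k+1 aspect-ratio bins
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size=8)

data_loader = torch.utils.data.DataLoader(train_dataset,
                                          batch_sampler=train_batch_sampler,
                                          num_workers=4,
                                          collate_fn=train_dataset.collate_fn)
```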
diff --git a/pytorch_object_detection/mask_rcnn/train_utils/train_eval_utils.py b/pytorch_object_detection/mask_rcnn/train_utils/train_eval_utils.py
new file mode 100644
index 000000000..29bae2fb2
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/train_utils/train_eval_utils.py
@@ -0,0 +1,109 @@
+import math
+import sys
+import time
+
+import torch
+
+import train_utils.distributed_utils as utils
+from .coco_eval import EvalCOCOMetric
+
+
+def train_one_epoch(model, optimizer, data_loader, device, epoch,
+ print_freq=50, warmup=False, scaler=None):
+ model.train()
+ metric_logger = utils.MetricLogger(delimiter=" ")
+ metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+ header = 'Epoch: [{}]'.format(epoch)
+
+ lr_scheduler = None
+    if epoch == 0 and warmup is True:  # use warmup during the first epoch (epoch=0) to ramp up the learning rate
+ warmup_factor = 1.0 / 1000
+ warmup_iters = min(1000, len(data_loader) - 1)
+
+ lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
+
+ mloss = torch.zeros(1).to(device) # mean losses
+ for i, [images, targets] in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+ images = list(image.to(device) for image in images)
+ targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
+
+        # autocast context manager for mixed-precision training; it is a no-op on CPU
+ with torch.cuda.amp.autocast(enabled=scaler is not None):
+ loss_dict = model(images, targets)
+
+ losses = sum(loss for loss in loss_dict.values())
+
+ # reduce losses over all GPUs for logging purpose
+ loss_dict_reduced = utils.reduce_dict(loss_dict)
+ losses_reduced = sum(loss for loss in loss_dict_reduced.values())
+
+ loss_value = losses_reduced.item()
+        # track the mean training loss
+ mloss = (mloss * i + loss_value) / (i + 1) # update mean losses
+
+        if not math.isfinite(loss_value):  # stop training if the loss becomes infinite/NaN
+ print("Loss is {}, stopping training".format(loss_value))
+ print(loss_dict_reduced)
+ sys.exit(1)
+
+ optimizer.zero_grad()
+ if scaler is not None:
+ scaler.scale(losses).backward()
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ losses.backward()
+ optimizer.step()
+
+        if lr_scheduler is not None:  # step the warmup scheduler during the first epoch
+ lr_scheduler.step()
+
+ metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
+ now_lr = optimizer.param_groups[0]["lr"]
+ metric_logger.update(lr=now_lr)
+
+ return mloss, now_lr
+
+
+@torch.no_grad()
+def evaluate(model, data_loader, device):
+ cpu_device = torch.device("cpu")
+ model.eval()
+ metric_logger = utils.MetricLogger(delimiter=" ")
+ header = "Test: "
+
+ det_metric = EvalCOCOMetric(data_loader.dataset.coco, iou_type="bbox", results_file_name="det_results.json")
+ seg_metric = EvalCOCOMetric(data_loader.dataset.coco, iou_type="segm", results_file_name="seg_results.json")
+ for image, targets in metric_logger.log_every(data_loader, 100, header):
+ image = list(img.to(device) for img in image)
+
+        # skip GPU-specific calls when running on CPU
+ if device != torch.device("cpu"):
+ torch.cuda.synchronize(device)
+
+ model_time = time.time()
+ outputs = model(image)
+
+ outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
+ model_time = time.time() - model_time
+
+ det_metric.update(targets, outputs)
+ seg_metric.update(targets, outputs)
+ metric_logger.update(model_time=model_time)
+
+ # gather the stats from all processes
+ metric_logger.synchronize_between_processes()
+ print("Averaged stats:", metric_logger)
+
+    # synchronize results across all processes
+ det_metric.synchronize_results()
+ seg_metric.synchronize_results()
+
+ if utils.is_main_process():
+ coco_info = det_metric.evaluate()
+ seg_info = seg_metric.evaluate()
+ else:
+ coco_info = None
+ seg_info = None
+
+ return coco_info, seg_info
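A condensed sketch of the epoch loop that consumes these two helpers (the train scripts later in this diff follow the same pattern); `model`, `optimizer`, `lr_scheduler`, the data loaders and the epoch range are assumed to be set up elsewhere.

```python
# Sketch of the per-epoch loop built on train_one_epoch / evaluate.
from train_utils.train_eval_utils import train_one_epoch, evaluate

for epoch in range(start_epoch, num_epochs):
    # returns the mean loss of the epoch and the current learning rate
    mean_loss, lr = train_one_epoch(model, optimizer, train_data_loader,
                                    device, epoch, print_freq=50,
                                    warmup=True, scaler=None)
    lr_scheduler.step()

    # COCO metrics for boxes and masks on the validation set
    det_info, seg_info = evaluate(model, val_data_loader, device=device)
```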
diff --git a/pytorch_object_detection/mask_rcnn/transforms.py b/pytorch_object_detection/mask_rcnn/transforms.py
new file mode 100644
index 000000000..6b3abe871
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/transforms.py
@@ -0,0 +1,38 @@
+import random
+from torchvision.transforms import functional as F
+
+
+class Compose(object):
+    """Compose several transform functions."""
+ def __init__(self, transforms):
+ self.transforms = transforms
+
+ def __call__(self, image, target):
+ for t in self.transforms:
+ image, target = t(image, target)
+ return image, target
+
+
+class ToTensor(object):
+    """Convert a PIL image to a Tensor."""
+ def __call__(self, image, target):
+ image = F.to_tensor(image)
+ return image, target
+
+
+class RandomHorizontalFlip(object):
+    """Randomly flip the image and its bounding boxes horizontally."""
+ def __init__(self, prob=0.5):
+ self.prob = prob
+
+ def __call__(self, image, target):
+ if random.random() < self.prob:
+ height, width = image.shape[-2:]
+            image = image.flip(-1)  # flip the image horizontally
+ bbox = target["boxes"]
+ # bbox: xmin, ymin, xmax, ymax
+            bbox[:, [0, 2]] = width - bbox[:, [2, 0]]  # flip the corresponding bbox coordinates
+ target["boxes"] = bbox
+ if "masks" in target:
+ target["masks"] = target["masks"].flip(-1)
+ return image, target
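For reference, a minimal sketch of how these transforms are combined; the validation.py below only uses `ToTensor`, while a training pipeline would typically add the random flip as well.

```python
# Sketch: typical transform pipelines built from this module.
import transforms

data_transform = {
    "train": transforms.Compose([transforms.ToTensor(),
                                 transforms.RandomHorizontalFlip(0.5)]),
    "val": transforms.Compose([transforms.ToTensor()]),
}

# each transform takes and returns an (image, target) pair:
# image, target = data_transform["train"](image, target)
```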
diff --git a/pytorch_object_detection/mask_rcnn/validation.py b/pytorch_object_detection/mask_rcnn/validation.py
new file mode 100644
index 000000000..a27288121
--- /dev/null
+++ b/pytorch_object_detection/mask_rcnn/validation.py
@@ -0,0 +1,218 @@
+"""
+This script loads trained model weights and computes the COCO metrics on the
+validation/test set, as well as the per-category mAP (IoU=0.5).
+"""
+
+import os
+import json
+
+import torch
+from tqdm import tqdm
+import numpy as np
+
+import transforms
+from backbone import resnet50_fpn_backbone
+from network_files import MaskRCNN
+from my_dataset_coco import CocoDetection
+from my_dataset_voc import VOCInstances
+from train_utils import EvalCOCOMetric
+
+
+def summarize(self, catId=None):
+ """
+ Compute and display summary metrics for evaluation results.
+    Note this function can *only* be applied on the default parameter setting
+ """
+
+ def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100):
+ p = self.params
+ iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
+ titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
+ typeStr = '(AP)' if ap == 1 else '(AR)'
+ iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
+ if iouThr is None else '{:0.2f}'.format(iouThr)
+
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+
+ if ap == 1:
+ # dimension of precision: [TxRxKxAxM]
+ s = self.eval['precision']
+ # IoU
+ if iouThr is not None:
+ t = np.where(iouThr == p.iouThrs)[0]
+ s = s[t]
+
+ if isinstance(catId, int):
+ s = s[:, :, catId, aind, mind]
+ else:
+ s = s[:, :, :, aind, mind]
+
+ else:
+ # dimension of recall: [TxKxAxM]
+ s = self.eval['recall']
+ if iouThr is not None:
+ t = np.where(iouThr == p.iouThrs)[0]
+ s = s[t]
+
+ if isinstance(catId, int):
+ s = s[:, catId, aind, mind]
+ else:
+ s = s[:, :, aind, mind]
+
+ if len(s[s > -1]) == 0:
+ mean_s = -1
+ else:
+ mean_s = np.mean(s[s > -1])
+
+ print_string = iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)
+ return mean_s, print_string
+
+    if not self.eval:
+        raise Exception('Please run accumulate() first')
+
+    stats, print_list = [0] * 12, [""] * 12
+ stats[0], print_list[0] = _summarize(1)
+ stats[1], print_list[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
+ stats[2], print_list[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2])
+ stats[3], print_list[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2])
+ stats[4], print_list[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2])
+ stats[5], print_list[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2])
+ stats[6], print_list[6] = _summarize(0, maxDets=self.params.maxDets[0])
+ stats[7], print_list[7] = _summarize(0, maxDets=self.params.maxDets[1])
+ stats[8], print_list[8] = _summarize(0, maxDets=self.params.maxDets[2])
+ stats[9], print_list[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2])
+ stats[10], print_list[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2])
+ stats[11], print_list[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2])
+
+ print_info = "\n".join(print_list)
+
+ return stats, print_info
+
+
+def save_info(coco_evaluator,
+ category_index: dict,
+ save_name: str = "record_mAP.txt"):
+ iou_type = coco_evaluator.params.iouType
+ print(f"IoU metric: {iou_type}")
+ # calculate COCO info for all classes
+ coco_stats, print_coco = summarize(coco_evaluator)
+
+    # calculate voc info for every class (IoU=0.5)
+ classes = [v for v in category_index.values() if v != "N/A"]
+ voc_map_info_list = []
+ for i in range(len(classes)):
+ stats, _ = summarize(coco_evaluator, catId=i)
+ voc_map_info_list.append(" {:15}: {}".format(classes[i], stats[1]))
+
+ print_voc = "\n".join(voc_map_info_list)
+ print(print_voc)
+
+    # save the validation results to a txt file
+ with open(save_name, "w") as f:
+ record_lines = ["COCO results:",
+ print_coco,
+ "",
+ "mAP(IoU=0.5) for each category:",
+ print_voc]
+ f.write("\n".join(record_lines))
+
+
+def main(parser_data):
+ device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
+ print("Using {} device training.".format(device.type))
+
+ data_transform = {
+ "val": transforms.Compose([transforms.ToTensor()])
+ }
+
+ # read class_indict
+ label_json_path = parser_data.label_json_path
+    assert os.path.exists(label_json_path), "json file {} does not exist.".format(label_json_path)
+ with open(label_json_path, 'r') as f:
+ category_index = json.load(f)
+
+ data_root = parser_data.data_path
+
+    # note: a custom collate_fn is needed because each sample contains both image and targets, so the default batching cannot be used
+ batch_size = parser_data.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using %g dataloader workers' % nw)
+
+ # load validation data set
+ val_dataset = CocoDetection(data_root, "val", data_transform["val"])
+ # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
+ # val_dataset = VOCInstances(data_root, year="2012", txt_name="val.txt", transforms=data_transform["val"])
+ val_dataset_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=batch_size,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ # create model
+ backbone = resnet50_fpn_backbone()
+    model = MaskRCNN(backbone, num_classes=parser_data.num_classes + 1)
+
+    # load your own trained model weights
+ weights_path = parser_data.weights_path
+    assert os.path.exists(weights_path), "{} file not found.".format(weights_path)
+ model.load_state_dict(torch.load(weights_path, map_location='cpu')['model'])
+ # print(model)
+
+ model.to(device)
+
+ # evaluate on the val dataset
+ cpu_device = torch.device("cpu")
+
+ det_metric = EvalCOCOMetric(val_dataset.coco, "bbox", "det_results.json")
+ seg_metric = EvalCOCOMetric(val_dataset.coco, "segm", "seg_results.json")
+ model.eval()
+ with torch.no_grad():
+ for image, targets in tqdm(val_dataset_loader, desc="validation..."):
+            # move the images to the specified device
+ image = list(img.to(device) for img in image)
+
+ # inference
+ outputs = model(image)
+
+ outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
+ det_metric.update(targets, outputs)
+ seg_metric.update(targets, outputs)
+
+ det_metric.synchronize_results()
+ seg_metric.synchronize_results()
+ det_metric.evaluate()
+ seg_metric.evaluate()
+
+ save_info(det_metric.coco_evaluator, category_index, "det_record_mAP.txt")
+ save_info(seg_metric.coco_evaluator, category_index, "seg_record_mAP.txt")
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+    # device to use
+ parser.add_argument('--device', default='cuda', help='device')
+
+    # number of object classes (excluding background)
+ parser.add_argument('--num-classes', type=int, default=90, help='number of classes')
+
+    # dataset root directory
+ parser.add_argument('--data-path', default='/data/coco2017', help='dataset root')
+
+    # trained weights file
+ parser.add_argument('--weights-path', default='./save_weights/model_25.pth', type=str, help='training weights')
+
+ # batch size(set to 1, don't change)
+ parser.add_argument('--batch-size', default=1, type=int, metavar='N',
+ help='batch size when validation.')
+    # mapping between category indices and category names
+ parser.add_argument('--label-json-path', type=str, default="coco91_indices.json")
+
+ args = parser.parse_args()
+
+ main(args)
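Beyond the command line, `main` can also be driven programmatically. The sketch below builds an argument namespace by hand and assumes it is run from the mask_rcnn directory; all paths are placeholders and must point at your own data and weights.

```python
# Sketch: invoking validation.py programmatically (placeholder paths).
from types import SimpleNamespace
import validation

args = SimpleNamespace(
    device="cuda",
    num_classes=90,                              # foreground classes, background excluded
    data_path="/data/coco2017",                  # root containing the COCO val images/annotations
    weights_path="./save_weights/model_25.pth",  # your trained checkpoint
    batch_size=1,                                # the script expects batch size 1
    label_json_path="coco91_indices.json",
)
validation.main(args)
```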
diff --git a/pytorch_object_detection/retinaNet/README.md b/pytorch_object_detection/retinaNet/README.md
index ffd48ecfb..bab2a22f8 100644
--- a/pytorch_object_detection/retinaNet/README.md
+++ b/pytorch_object_detection/retinaNet/README.md
@@ -6,10 +6,10 @@
## Environment setup:
* Python3.6/3.7/3.8
* Pytorch1.7.1 (note: must be 1.6.0 or later, because the official mixed-precision training is only supported from 1.6.0 on)
-* pycocotools(Linux: ```pip install pycocotools```; Windows: ```pip install pycocotools-windows``` (no extra VS installation required))
+* pycocotools(Linux: `pip install pycocotools`; Windows: `pip install pycocotools-windows` (no extra VS installation required))
* Ubuntu or CentOS (Windows is not recommended)
* Training on a GPU is strongly recommended
-* See ```requirements.txt``` for the detailed environment configuration
+* See `requirements.txt` for the detailed environment configuration
## File structure:
```
@@ -26,8 +26,8 @@
## Pre-trained weights download link (place the file in the backbone folder after downloading):
* ResNet50+FPN backbone: https://download.pytorch.org/models/retinanet_resnet50_fpn_coco-eeacb38b.pth
-* Note: remember to rename the downloaded pre-trained weights; for example, train.py reads the ```retinanet_resnet50_fpn_coco.pth``` file,
-  not ```retinanet_resnet50_fpn_coco-eeacb38b.pth```
+* Note: remember to rename the downloaded pre-trained weights; for example, train.py reads the `retinanet_resnet50_fpn_coco.pth` file,
+  not `retinanet_resnet50_fpn_coco-eeacb38b.pth`
## Dataset: this example uses the PASCAL VOC2012 dataset
@@ -54,14 +54,15 @@
* Make sure the dataset is prepared in advance
* Make sure the corresponding pre-trained weights are downloaded in advance
* For single-GPU training, use the train.py script directly
-* For multi-GPU training, use the ```python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py``` command, where the ```nproc_per_node``` parameter is the number of GPUs to use
-* To specify which GPU devices to use, prepend ```CUDA_VISIBLE_DEVICES=0,3``` to the command (for example, to use only the 1st and 4th GPU devices)
-* ```CUDA_VISIBLE_DEVICES=0,3 python -m torch.distributed.launch --nproc_per_node=2 --use_env train_multi_GPU.py```
+* For multi-GPU training, use the `python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py` command, where the `nproc_per_node` parameter is the number of GPUs to use
+* To specify which GPU devices to use, prepend `CUDA_VISIBLE_DEVICES=0,3` to the command (for example, to use only the 1st and 4th GPU devices)
+* `CUDA_VISIBLE_DEVICES=0,3 python -m torch.distributed.launch --nproc_per_node=2 --use_env train_multi_GPU.py`
## Notes
-* When using the training scripts, make sure to set '--data-path' (VOC_root) to the **root directory** that contains your 'VOCdevkit' folder
+* When using the training scripts, make sure to set `--data-path` (VOC_root) to the **root directory** that contains your `VOCdevkit` folder
* Since Faster RCNN with an FPN is very memory-hungry, if your GPU memory is insufficient (i.e. batch_size is smaller than 8) it is recommended to use the default norm_layer in the create_model function,
  i.e. do not pass the norm_layer argument, so that FrozenBatchNorm2d (a bn layer whose parameters are never updated) is used by default; in practice this also works well.
-* When using the prediction script, set 'train_weights' to the path of your own generated weights.
-* When using the validation file, make sure your validation or test set contains objects of every category, and only modify '--num-classes', '--data-path' and '--weights'; try not to change any other code
+* The `results.txt` saved during training contains the COCO metrics on the validation set for each epoch: the first 12 values are the COCO metrics, and the last two values are the mean training loss and the learning rate
+* When using the prediction script, set `weights_path` to the path of your own generated weights.
+* When using the validation file, make sure your validation or test set contains objects of every category, and only modify `--num-classes`, `--data-path` and `--weights-path`; try not to change any other code
diff --git a/pytorch_object_detection/retinaNet/backbone/feature_pyramid_network.py b/pytorch_object_detection/retinaNet/backbone/feature_pyramid_network.py
index b9f4ea50b..505fbae3b 100644
--- a/pytorch_object_detection/retinaNet/backbone/feature_pyramid_network.py
+++ b/pytorch_object_detection/retinaNet/backbone/feature_pyramid_network.py
@@ -8,6 +8,111 @@
from torch.jit.annotations import Tuple, List, Dict
+class IntermediateLayerGetter(nn.ModuleDict):
+ """
+ Module wrapper that returns intermediate layers from a model
+ It has a strong assumption that the modules have been registered
+ into the model in the same order as they are used.
+ This means that one should **not** reuse the same nn.Module
+ twice in the forward if you want this to work.
+ Additionally, it is only able to query submodules that are directly
+ assigned to the model. So if `model` is passed, `model.feature1` can
+ be returned, but not `model.feature1.layer2`.
+ Arguments:
+ model (nn.Module): model on which we will extract the features
+ return_layers (Dict[name, new_name]): a dict containing the names
+ of the modules for which the activations will be returned as
+ the key of the dict, and the value of the dict is the name
+ of the returned activation (which the user can specify).
+ """
+ __annotations__ = {
+ "return_layers": Dict[str, str],
+ }
+
+ def __init__(self, model, return_layers):
+ if not set(return_layers).issubset([name for name, _ in model.named_children()]):
+ raise ValueError("return_layers are not present in model")
+
+ orig_return_layers = return_layers
+ return_layers = {str(k): str(v) for k, v in return_layers.items()}
+ layers = OrderedDict()
+
+        # iterate over the model's children in order and store them in an ordered dict
+        # keep only layer4 and everything before it; drop the unused layers after it
+ for name, module in model.named_children():
+ layers[name] = module
+ if name in return_layers:
+ del return_layers[name]
+ if not return_layers:
+ break
+
+ super().__init__(layers)
+ self.return_layers = orig_return_layers
+
+ def forward(self, x):
+ out = OrderedDict()
+        # run the forward pass through every child module in order,
+        # collecting the outputs of layer1, layer2, layer3 and layer4
+ for name, module in self.items():
+ x = module(x)
+ if name in self.return_layers:
+ out_name = self.return_layers[name]
+ out[out_name] = x
+ return out
+
+
+class BackboneWithFPN(nn.Module):
+ """
+ Adds a FPN on top of a model.
+ Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
+ extract a submodel that returns the feature maps specified in return_layers.
+    The same limitations of IntermediateLayerGetter apply here.
+ Arguments:
+ backbone (nn.Module)
+ return_layers (Dict[name, new_name]): a dict containing the names
+ of the modules for which the activations will be returned as
+ the key of the dict, and the value of the dict is the name
+ of the returned activation (which the user can specify).
+ in_channels_list (List[int]): number of channels for each feature map
+ that is returned, in the order they are present in the OrderedDict
+ out_channels (int): number of channels in the FPN.
+ extra_blocks: ExtraFPNBlock
+ Attributes:
+ out_channels (int): the number of channels in the FPN
+ """
+
+ def __init__(self,
+ backbone: nn.Module,
+ return_layers=None,
+ in_channels_list=None,
+ out_channels=256,
+ extra_blocks=None,
+ re_getter=True):
+ super().__init__()
+
+ if extra_blocks is None:
+ extra_blocks = LastLevelMaxPool()
+
+ if re_getter:
+ assert return_layers is not None
+ self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+ else:
+ self.body = backbone
+
+ self.fpn = FeaturePyramidNetwork(
+ in_channels_list=in_channels_list,
+ out_channels=out_channels,
+ extra_blocks=extra_blocks,
+ )
+
+ self.out_channels = out_channels
+
+ def forward(self, x):
+ x = self.body(x)
+ x = self.fpn(x)
+ return x
+
+
class ExtraFPNBlock(nn.Module):
"""
Base class for the extra block in the FPN.
@@ -35,8 +140,7 @@ class LastLevelMaxPool(torch.nn.Module):
Applies a max_pool2d on top of the last feature map
"""
- def forward(self, x, y, names):
- # type: (List[Tensor], List[Tensor], List[str]) -> Tuple[List[Tensor], List[str]]
+ def forward(self, x: List[Tensor], y: List[Tensor], names: List[str]) -> Tuple[List[Tensor], List[str]]:
names.append("pool")
x.append(F.max_pool2d(x[-1], 1, 2, 0))
return x, names
@@ -47,7 +151,7 @@ class LastLevelP6P7(ExtraFPNBlock):
This module is used in RetinaNet to generate extra layers, P6 and P7.
"""
def __init__(self, in_channels: int, out_channels: int):
- super(LastLevelP6P7, self).__init__()
+ super().__init__()
self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
for module in [self.p6, self.p7]:
@@ -87,7 +191,7 @@ class FeaturePyramidNetwork(nn.Module):
"""
def __init__(self, in_channels_list, out_channels, extra_blocks=None):
- super(FeaturePyramidNetwork, self).__init__()
+ super().__init__()
# 用来调整resnet特征矩阵(layer1,2,3,4)的channel(kernel_size=1)
self.inner_blocks = nn.ModuleList()
# 对调整后的特征矩阵使用3x3的卷积核来得到对应的预测特征矩阵
@@ -108,8 +212,7 @@ def __init__(self, in_channels_list, out_channels, extra_blocks=None):
self.extra_blocks = extra_blocks
- def get_result_from_inner_blocks(self, x, idx):
- # type: (Tensor, int) -> Tensor
+ def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.inner_blocks[idx](x),
but torchscript doesn't support this yet
@@ -125,8 +228,7 @@ def get_result_from_inner_blocks(self, x, idx):
i += 1
return out
- def get_result_from_layer_blocks(self, x, idx):
- # type: (Tensor, int) -> Tensor
+ def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.layer_blocks[idx](x),
but torchscript doesn't support this yet
@@ -142,8 +244,7 @@ def get_result_from_layer_blocks(self, x, idx):
i += 1
return out
- def forward(self, x):
- # type: (Dict[str, Tensor]) -> Dict[str, Tensor]
+ def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""
Computes the FPN for a set of feature maps.
Arguments:
diff --git a/pytorch_object_detection/retinaNet/backbone/resnet50_fpn_model.py b/pytorch_object_detection/retinaNet/backbone/resnet50_fpn_model.py
index 553f8aac8..451bf5649 100644
--- a/pytorch_object_detection/retinaNet/backbone/resnet50_fpn_model.py
+++ b/pytorch_object_detection/retinaNet/backbone/resnet50_fpn_model.py
@@ -1,19 +1,17 @@
import os
-from collections import OrderedDict
import torch.nn as nn
import torch
-from torch.jit.annotations import List, Dict
from torchvision.ops.misc import FrozenBatchNorm2d
-from .feature_pyramid_network import LastLevelMaxPool, FeaturePyramidNetwork
+from .feature_pyramid_network import LastLevelMaxPool, BackboneWithFPN
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None):
- super(Bottleneck, self).__init__()
+ super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
@@ -56,7 +54,7 @@ def forward(self, x):
class ResNet(nn.Module):
def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None):
- super(ResNet, self).__init__()
+ super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
@@ -136,100 +134,6 @@ def overwrite_eps(model, eps):
module.eps = eps
-class IntermediateLayerGetter(nn.ModuleDict):
- """
- Module wrapper that returns intermediate layers from a model
- It has a strong assumption that the modules have been registered
- into the model in the same order as they are used.
- This means that one should **not** reuse the same nn.Module
- twice in the forward if you want this to work.
- Additionally, it is only able to query submodules that are directly
- assigned to the model. So if `model` is passed, `model.feature1` can
- be returned, but not `model.feature1.layer2`.
- Arguments:
- model (nn.Module): model on which we will extract the features
- return_layers (Dict[name, new_name]): a dict containing the names
- of the modules for which the activations will be returned as
- the key of the dict, and the value of the dict is the name
- of the returned activation (which the user can specify).
- """
- __annotations__ = {
- "return_layers": Dict[str, str],
- }
-
- def __init__(self, model, return_layers):
- if not set(return_layers).issubset([name for name, _ in model.named_children()]):
- raise ValueError("return_layers are not present in model")
-
- orig_return_layers = return_layers
- return_layers = {str(k): str(v) for k, v in return_layers.items()}
- layers = OrderedDict()
-
- # 遍历模型子模块按顺序存入有序字典
- # 只保存layer4及其之前的结构,舍去之后不用的结构
- for name, module in model.named_children():
- layers[name] = module
- if name in return_layers:
- del return_layers[name]
- if not return_layers:
- break
-
- super(IntermediateLayerGetter, self).__init__(layers)
- self.return_layers = orig_return_layers
-
- def forward(self, x):
- out = OrderedDict()
- # 依次遍历模型的所有子模块,并进行正向传播,
- # 收集layer1, layer2, layer3, layer4的输出
- for name, module in self.items():
- x = module(x)
- if name in self.return_layers:
- out_name = self.return_layers[name]
- out[out_name] = x
- return out
-
-
-class BackboneWithFPN(nn.Module):
- """
- Adds a FPN on top of a model.
- Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
- extract a submodel that returns the feature maps specified in return_layers.
- The same limitations of IntermediatLayerGetter apply here.
- Arguments:
- backbone (nn.Module)
- return_layers (Dict[name, new_name]): a dict containing the names
- of the modules for which the activations will be returned as
- the key of the dict, and the value of the dict is the name
- of the returned activation (which the user can specify).
- in_channels_list (List[int]): number of channels for each feature map
- that is returned, in the order they are present in the OrderedDict
- out_channels (int): number of channels in the FPN.
- extra_blocks: ExtraFPNBlock
- Attributes:
- out_channels (int): the number of channels in the FPN
- """
-
- def __init__(self, backbone, return_layers, in_channels_list, out_channels, extra_blocks=None):
- super(BackboneWithFPN, self).__init__()
-
- if extra_blocks is None:
- extra_blocks = LastLevelMaxPool()
-
- self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
- self.fpn = FeaturePyramidNetwork(
- in_channels_list=in_channels_list,
- out_channels=out_channels,
- extra_blocks=extra_blocks,
- )
-
- self.out_channels = out_channels
-
- def forward(self, x):
- x = self.body(x)
- x = self.fpn(x)
- return x
-
-
def resnet50_fpn_backbone(pretrain_path="",
norm_layer=FrozenBatchNorm2d, # FrozenBatchNorm2d的功能与BatchNorm2d类似,但参数无法更新
trainable_layers=3,
diff --git a/pytorch_object_detection/retinaNet/draw_box_utils.py b/pytorch_object_detection/retinaNet/draw_box_utils.py
index 1a2926583..835d7f7c1 100644
--- a/pytorch_object_detection/retinaNet/draw_box_utils.py
+++ b/pytorch_object_detection/retinaNet/draw_box_utils.py
@@ -1,6 +1,7 @@
-import collections
+from PIL.Image import Image, fromarray
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
+from PIL import ImageColor
import numpy as np
STANDARD_COLORS = [
@@ -30,66 +31,123 @@
]
-def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map):
- for i in range(boxes.shape[0]):
- if scores[i] > thresh:
- box = tuple(boxes[i].tolist()) # numpy -> list -> tuple
- if classes[i] in category_index.keys():
- class_name = category_index[classes[i]]
- else:
- class_name = 'N/A'
- display_str = str(class_name)
- display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
- box_to_display_str_map[box].append(display_str)
- box_to_color_map[box] = STANDARD_COLORS[
- classes[i] % len(STANDARD_COLORS)]
- else:
- break # 网络输出概率已经排序过,当遇到一个不满足后面的肯定不满足
-
-
-def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
+def draw_text(draw,
+ box: list,
+ cls: int,
+ score: float,
+ category_index: dict,
+ color: str,
+ font: str = 'arial.ttf',
+ font_size: int = 24):
+ """
+    Draw the bounding box and class information on the image.
+ """
try:
- font = ImageFont.truetype('arial.ttf', 24)
+ font = ImageFont.truetype(font, font_size)
except IOError:
font = ImageFont.load_default()
+ left, top, right, bottom = box
# If the total height of the display strings added to the top of the bounding
# box exceeds the top of the image, stack the strings below the bounding box
# instead of above.
- display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]]
+ display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
+ display_str_heights = [font.getsize(ds)[1] for ds in display_str]
# Each display_str has a top and bottom margin of 0.05x.
- total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
+ display_str_height = (1 + 2 * 0.05) * max(display_str_heights)
- if top > total_display_str_height:
+ if top > display_str_height:
+ text_top = top - display_str_height
text_bottom = top
else:
- text_bottom = bottom + total_display_str_height
- # Reverse list and print from bottom to top.
- for display_str in box_to_display_str_map[box][::-1]:
- text_width, text_height = font.getsize(display_str)
- margin = np.ceil(0.05 * text_height)
- draw.rectangle([(left, text_bottom - text_height - 2 * margin),
- (left + text_width, text_bottom)], fill=color)
- draw.text((left + margin, text_bottom - text_height - margin),
- display_str,
+ text_top = bottom
+ text_bottom = bottom + display_str_height
+
+ for ds in display_str:
+ text_width, text_height = font.getsize(ds)
+ margin = np.ceil(0.05 * text_width)
+ draw.rectangle([(left, text_top),
+ (left + text_width + 2 * margin, text_bottom)], fill=color)
+ draw.text((left + margin, text_top),
+ ds,
fill='black',
font=font)
- text_bottom -= text_height - 2 * margin
+ left += text_width
+
+
+def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
+ np_image = np.array(image)
+ masks = np.where(masks > thresh, True, False)
+
+ # colors = np.array(colors)
+ img_to_draw = np.copy(np_image)
+ # TODO: There might be a way to vectorize this
+ for mask, color in zip(masks, colors):
+ img_to_draw[mask] = color
+
+ out = np_image * (1 - alpha) + img_to_draw * alpha
+ return fromarray(out.astype(np.uint8))
+
+
+def draw_objs(image: Image,
+ boxes: np.ndarray = None,
+ classes: np.ndarray = None,
+ scores: np.ndarray = None,
+ masks: np.ndarray = None,
+ category_index: dict = None,
+ box_thresh: float = 0.1,
+ mask_thresh: float = 0.5,
+ line_thickness: int = 8,
+ font: str = 'arial.ttf',
+ font_size: int = 24,
+ draw_boxes_on_image: bool = True,
+ draw_masks_on_image: bool = False):
+ """
+    Draw bounding boxes, class labels and (optionally) masks on the image
+    Args:
+        image: the image to draw on
+        boxes: bounding box information
+        classes: class index information
+        scores: prediction score information
+        masks: mask information
+        category_index: dict mapping class indices to class names
+        box_thresh: score threshold used to filter boxes
+        mask_thresh: threshold used to binarize masks
+        line_thickness: bounding box line width
+        font: font type
+        font_size: font size
+        draw_boxes_on_image: whether to draw the boxes
+        draw_masks_on_image: whether to draw the masks
+
+ Returns:
+
+ """
+
+    # filter out low-score objects
+ idxs = np.greater(scores, box_thresh)
+ boxes = boxes[idxs]
+ classes = classes[idxs]
+ scores = scores[idxs]
+ if masks is not None:
+ masks = masks[idxs]
+ if len(boxes) == 0:
+ return image
+ colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]
-def draw_box(image, boxes, classes, scores, category_index, thresh=0.5, line_thickness=8):
- box_to_display_str_map = collections.defaultdict(list)
- box_to_color_map = collections.defaultdict(str)
+ if draw_boxes_on_image:
+ # Draw all boxes onto image.
+ draw = ImageDraw.Draw(image)
+ for box, cls, score, color in zip(boxes, classes, scores, colors):
+ left, top, right, bottom = box
+            # draw the bounding box
+ draw.line([(left, top), (left, bottom), (right, bottom),
+ (right, top), (left, top)], width=line_thickness, fill=color)
+            # draw the class and score information
+ draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)
- filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)
+ if draw_masks_on_image and (masks is not None):
+        # Draw all masks onto the image.
+ image = draw_masks(image, masks, colors, mask_thresh)
- # Draw all boxes onto image.
- draw = ImageDraw.Draw(image)
- im_width, im_height = image.size
- for box, color in box_to_color_map.items():
- xmin, ymin, xmax, ymax = box
- (left, right, top, bottom) = (xmin * 1, xmax * 1,
- ymin * 1, ymax * 1)
- draw.line([(left, top), (left, bottom), (right, bottom),
- (right, top), (left, top)], width=line_thickness, fill=color)
- draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
+ return image
diff --git a/pytorch_object_detection/retinaNet/my_dataset.py b/pytorch_object_detection/retinaNet/my_dataset.py
index 5a8a4e93a..3dc863bc0 100644
--- a/pytorch_object_detection/retinaNet/my_dataset.py
+++ b/pytorch_object_detection/retinaNet/my_dataset.py
@@ -11,7 +11,11 @@ class VOCDataSet(Dataset):
def __init__(self, voc_root, year="2012", transforms=None, txt_name: str = "train.txt"):
assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
- self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
+        # tolerate both layouts: voc_root may or may not already include "VOCdevkit"
+ if "VOCdevkit" in voc_root:
+ self.root = os.path.join(voc_root, f"VOC{year}")
+ else:
+ self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
self.img_root = os.path.join(self.root, "JPEGImages")
self.annotations_root = os.path.join(self.root, "Annotations")
@@ -31,9 +35,8 @@ def __init__(self, voc_root, year="2012", transforms=None, txt_name: str = "trai
# read class_indict
json_file = './pascal_voc_classes.json'
assert os.path.exists(json_file), "{} file not exist.".format(json_file)
- json_file = open(json_file, 'r')
- self.class_dict = json.load(json_file)
- json_file.close()
+ with open(json_file, 'r') as f:
+ self.class_dict = json.load(f)
self.transforms = transforms
@@ -181,7 +184,7 @@ def collate_fn(batch):
return tuple(zip(*batch))
# import transforms
-# from draw_box_utils import draw_box
+# from draw_box_utils import draw_objs
# from PIL import Image
# import json
# import matplotlib.pyplot as plt
@@ -193,7 +196,7 @@ def collate_fn(batch):
# try:
# json_file = open('./pascal_voc_classes.json', 'r')
# class_dict = json.load(json_file)
-# category_index = {v: k for k, v in class_dict.items()}
+# category_index = {str(v): str(k) for k, v in class_dict.items()}
# except Exception as e:
# print(e)
# exit(-1)
@@ -210,12 +213,14 @@ def collate_fn(batch):
# for index in random.sample(range(0, len(train_data_set)), k=5):
# img, target = train_data_set[index]
# img = ts.ToPILImage()(img)
-# draw_box(img,
-# target["boxes"].numpy(),
-# target["labels"].numpy(),
-# [1 for i in range(len(target["labels"].numpy()))],
-# category_index,
-# thresh=0.5,
-# line_thickness=5)
-# plt.imshow(img)
+# plot_img = draw_objs(img,
+# target["boxes"].numpy(),
+# target["labels"].numpy(),
+# np.ones(target["labels"].shape[0]),
+# category_index=category_index,
+# box_thresh=0.5,
+# line_thickness=3,
+# font='arial.ttf',
+# font_size=20)
+# plt.imshow(plot_img)
# plt.show()
diff --git a/pytorch_object_detection/retinaNet/network_files/boxes.py b/pytorch_object_detection/retinaNet/network_files/boxes.py
index f720df1f8..8eeca4573 100644
--- a/pytorch_object_detection/retinaNet/network_files/boxes.py
+++ b/pytorch_object_detection/retinaNet/network_files/boxes.py
@@ -23,7 +23,7 @@ def nms(boxes, scores, iou_threshold):
scores for each one of the boxes
iou_threshold : float
discards all overlapping
- boxes with IoU < iou_threshold
+ boxes with IoU > iou_threshold
Returns
-------
diff --git a/pytorch_object_detection/retinaNet/predict.py b/pytorch_object_detection/retinaNet/predict.py
index 47ed83008..954fd336e 100644
--- a/pytorch_object_detection/retinaNet/predict.py
+++ b/pytorch_object_detection/retinaNet/predict.py
@@ -9,7 +9,7 @@
from torchvision import transforms
from network_files import RetinaNet
from backbone import resnet50_fpn_backbone, LastLevelP6P7
-from draw_box_utils import draw_box
+from draw_box_utils import draw_objs
def create_model(num_classes):
@@ -38,18 +38,20 @@ def main():
model = create_model(num_classes=20)
# load train weights
- train_weights = "./save_weights/model.pth"
- assert os.path.exists(train_weights), "{} file dose not exist.".format(train_weights)
- model.load_state_dict(torch.load(train_weights, map_location=device)["model"])
+ weights_path = "./save_weights/model.pth"
+    assert os.path.exists(weights_path), "{} file does not exist.".format(weights_path)
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
model.to(device)
# read class_indict
label_json_path = './pascal_voc_classes.json'
assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
- json_file = open(label_json_path, 'r')
- class_dict = json.load(json_file)
- json_file.close()
- category_index = {v: k for k, v in class_dict.items()}
+ with open(label_json_path, 'r') as f:
+ class_dict = json.load(f)
+
+ category_index = {str(v): str(k) for k, v in class_dict.items()}
# load image
original_img = Image.open("./test.jpg")
@@ -79,17 +81,19 @@ def main():
if len(predict_boxes) == 0:
print("没有检测到任何目标!")
- draw_box(original_img,
- predict_boxes,
- predict_classes,
- predict_scores,
- category_index,
- thresh=0.4,
- line_thickness=3)
- plt.imshow(original_img)
+ plot_img = draw_objs(original_img,
+ predict_boxes,
+ predict_classes,
+ predict_scores,
+ category_index=category_index,
+ box_thresh=0.5,
+ line_thickness=3,
+ font='arial.ttf',
+ font_size=20)
+ plt.imshow(plot_img)
plt.show()
# 保存预测的图片结果
- original_img.save("test_result.jpg")
+ plot_img.save("test_result.jpg")
if __name__ == '__main__':
diff --git a/pytorch_object_detection/retinaNet/requirements.txt b/pytorch_object_detection/retinaNet/requirements.txt
index b5854c8d5..846ad37de 100644
--- a/pytorch_object_detection/retinaNet/requirements.txt
+++ b/pytorch_object_detection/retinaNet/requirements.txt
@@ -1,6 +1,6 @@
lxml
matplotlib
-nump
+numpy
tqdm
torch==1.7.1
torchvision==0.8.2
diff --git a/pytorch_object_detection/retinaNet/train.py b/pytorch_object_detection/retinaNet/train.py
index bded930ff..314bad117 100644
--- a/pytorch_object_detection/retinaNet/train.py
+++ b/pytorch_object_detection/retinaNet/train.py
@@ -35,8 +35,8 @@ def create_model(num_classes):
return model
-def main(parser_data):
- device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
print("Using {} device training.".format(device.type))
results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
@@ -47,7 +47,7 @@ def main(parser_data):
"val": transforms.Compose([transforms.ToTensor()])
}
- VOC_root = parser_data.data_path
+ VOC_root = args.data_path
# check voc root
if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
raise FileNotFoundError("VOCdevkit dose not in path:'{}'.".format(VOC_root))
@@ -67,7 +67,7 @@ def main(parser_data):
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
# 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch
- batch_size = parser_data.batch_size
+ batch_size = args.batch_size
nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
print('Using %g dataloader workers' % nw)
if train_sampler:
@@ -97,7 +97,7 @@ def main(parser_data):
# create model
# 注意:不包含背景
- model = create_model(num_classes=parser_data.num_classes)
+ model = create_model(num_classes=args.num_classes)
# print(model)
model.to(device)
@@ -115,21 +115,21 @@ def main(parser_data):
gamma=0.33)
# 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练
- if parser_data.resume != "":
- checkpoint = torch.load(parser_data.resume, map_location='cpu')
+ if args.resume != "":
+ checkpoint = torch.load(args.resume, map_location='cpu')
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
- parser_data.start_epoch = checkpoint['epoch'] + 1
+ args.start_epoch = checkpoint['epoch'] + 1
if args.amp and "scaler" in checkpoint:
scaler.load_state_dict(checkpoint["scaler"])
- print("the training process from epoch{}...".format(parser_data.start_epoch))
+ print("the training process from epoch{}...".format(args.start_epoch))
train_loss = []
learning_rate = []
val_map = []
- for epoch in range(parser_data.start_epoch, parser_data.epochs):
+ for epoch in range(args.start_epoch, args.epochs):
# train for one epoch, printing every 10 iterations
mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
device, epoch, print_freq=50,
@@ -146,7 +146,7 @@ def main(parser_data):
# write into txt
with open(results_file, "a") as f:
# 写入的数据包括coco指标还有loss和learning rate
- result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
diff --git a/pytorch_object_detection/retinaNet/train_multi_GPU.py b/pytorch_object_detection/retinaNet/train_multi_GPU.py
index 047a64eb2..35ed8fc77 100644
--- a/pytorch_object_detection/retinaNet/train_multi_GPU.py
+++ b/pytorch_object_detection/retinaNet/train_multi_GPU.py
@@ -156,7 +156,7 @@ def main(args):
# write into txt
with open(results_file, "a") as f:
# 写入的数据包括coco指标还有loss和learning rate
- result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
diff --git a/pytorch_object_detection/retinaNet/validation.py b/pytorch_object_detection/retinaNet/validation.py
index ffd320443..cc2826763 100644
--- a/pytorch_object_detection/retinaNet/validation.py
+++ b/pytorch_object_detection/retinaNet/validation.py
@@ -100,9 +100,9 @@ def main(parser_data):
# read class_indict
label_json_path = './pascal_voc_classes.json'
assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
- json_file = open(label_json_path, 'r')
- class_dict = json.load(json_file)
- json_file.close()
+ with open(label_json_path, 'r') as f:
+ class_dict = json.load(f)
+
category_index = {v: k for k, v in class_dict.items()}
VOC_root = parser_data.data_path
@@ -132,9 +132,11 @@ def main(parser_data):
model = RetinaNet(backbone, parser_data.num_classes)
# 载入你自己训练好的模型权重
- weights_path = parser_data.weights
+ weights_path = parser_data.weights_path
assert os.path.exists(weights_path), "not found {} file.".format(weights_path)
- model.load_state_dict(torch.load(weights_path, map_location=device)['model'])
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
# print(model)
model.to(device)
@@ -203,7 +205,7 @@ def main(parser_data):
parser.add_argument('--data-path', default='/data', help='dataset root')
# 训练好的权重文件
- parser.add_argument('--weights', default='./save_weights/model.pth', type=str, help='training weights')
+ parser.add_argument('--weights-path', default='./save_weights/model.pth', type=str, help='training weights')
# batch size
parser.add_argument('--batch_size', default=1, type=int, metavar='N',
diff --git a/pytorch_object_detection/ssd/README.md b/pytorch_object_detection/ssd/README.md
index be7f2f435..ab51771ab 100644
--- a/pytorch_object_detection/ssd/README.md
+++ b/pytorch_object_detection/ssd/README.md
@@ -38,6 +38,7 @@
* Make sure the corresponding pre-trained weights are downloaded in advance
* For single-GPU or CPU training, use the train_ssd300.py script directly
* For multi-GPU training, use the "python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py" command, where the nproc_per_node parameter is the number of GPUs to use
+* The `results.txt` saved during training contains the COCO metrics on the validation set for each epoch: the first 12 values are the COCO metrics, and the last two values are the mean training loss and the learning rate
## If you are not familiar with how the SSD algorithm works, you can refer to my bilibili videos
* https://www.bilibili.com/video/BV1fT4y1L7Gi
diff --git a/pytorch_object_detection/ssd/draw_box_utils.py b/pytorch_object_detection/ssd/draw_box_utils.py
index 1a2926583..835d7f7c1 100644
--- a/pytorch_object_detection/ssd/draw_box_utils.py
+++ b/pytorch_object_detection/ssd/draw_box_utils.py
@@ -1,6 +1,7 @@
-import collections
+from PIL.Image import Image, fromarray
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
+from PIL import ImageColor
import numpy as np
STANDARD_COLORS = [
@@ -30,66 +31,123 @@
]
-def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map):
- for i in range(boxes.shape[0]):
- if scores[i] > thresh:
- box = tuple(boxes[i].tolist()) # numpy -> list -> tuple
- if classes[i] in category_index.keys():
- class_name = category_index[classes[i]]
- else:
- class_name = 'N/A'
- display_str = str(class_name)
- display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
- box_to_display_str_map[box].append(display_str)
- box_to_color_map[box] = STANDARD_COLORS[
- classes[i] % len(STANDARD_COLORS)]
- else:
- break # 网络输出概率已经排序过,当遇到一个不满足后面的肯定不满足
-
-
-def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
+def draw_text(draw,
+ box: list,
+ cls: int,
+ score: float,
+ category_index: dict,
+ color: str,
+ font: str = 'arial.ttf',
+ font_size: int = 24):
+ """
+    Draw the bounding box and class information on the image.
+ """
try:
- font = ImageFont.truetype('arial.ttf', 24)
+ font = ImageFont.truetype(font, font_size)
except IOError:
font = ImageFont.load_default()
+ left, top, right, bottom = box
# If the total height of the display strings added to the top of the bounding
# box exceeds the top of the image, stack the strings below the bounding box
# instead of above.
- display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]]
+ display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
+ display_str_heights = [font.getsize(ds)[1] for ds in display_str]
# Each display_str has a top and bottom margin of 0.05x.
- total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
+ display_str_height = (1 + 2 * 0.05) * max(display_str_heights)
- if top > total_display_str_height:
+ if top > display_str_height:
+ text_top = top - display_str_height
text_bottom = top
else:
- text_bottom = bottom + total_display_str_height
- # Reverse list and print from bottom to top.
- for display_str in box_to_display_str_map[box][::-1]:
- text_width, text_height = font.getsize(display_str)
- margin = np.ceil(0.05 * text_height)
- draw.rectangle([(left, text_bottom - text_height - 2 * margin),
- (left + text_width, text_bottom)], fill=color)
- draw.text((left + margin, text_bottom - text_height - margin),
- display_str,
+ text_top = bottom
+ text_bottom = bottom + display_str_height
+
+ for ds in display_str:
+ text_width, text_height = font.getsize(ds)
+ margin = np.ceil(0.05 * text_width)
+ draw.rectangle([(left, text_top),
+ (left + text_width + 2 * margin, text_bottom)], fill=color)
+ draw.text((left + margin, text_top),
+ ds,
fill='black',
font=font)
- text_bottom -= text_height - 2 * margin
+ left += text_width
+
+
+def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
+ np_image = np.array(image)
+ masks = np.where(masks > thresh, True, False)
+
+ # colors = np.array(colors)
+ img_to_draw = np.copy(np_image)
+ # TODO: There might be a way to vectorize this
+ for mask, color in zip(masks, colors):
+ img_to_draw[mask] = color
+
+ out = np_image * (1 - alpha) + img_to_draw * alpha
+ return fromarray(out.astype(np.uint8))
+
+
+def draw_objs(image: Image,
+ boxes: np.ndarray = None,
+ classes: np.ndarray = None,
+ scores: np.ndarray = None,
+ masks: np.ndarray = None,
+ category_index: dict = None,
+ box_thresh: float = 0.1,
+ mask_thresh: float = 0.5,
+ line_thickness: int = 8,
+ font: str = 'arial.ttf',
+ font_size: int = 24,
+ draw_boxes_on_image: bool = True,
+ draw_masks_on_image: bool = False):
+ """
+    Draw bounding boxes, class labels and masks onto an image
+    Args:
+        image: image to draw on
+        boxes: bounding box information
+        classes: class information
+        scores: object probability scores
+        masks: mask information
+        category_index: dict mapping class ids to class names
+        box_thresh: probability threshold used for filtering
+        mask_thresh:
+        line_thickness: bounding box line width
+        font: font type
+        font_size: font size
+        draw_boxes_on_image:
+        draw_masks_on_image:
+
+    Returns:
+
+    """
+
+    # filter out low-probability objects
+ idxs = np.greater(scores, box_thresh)
+ boxes = boxes[idxs]
+ classes = classes[idxs]
+ scores = scores[idxs]
+ if masks is not None:
+ masks = masks[idxs]
+ if len(boxes) == 0:
+ return image
+ colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]
-def draw_box(image, boxes, classes, scores, category_index, thresh=0.5, line_thickness=8):
- box_to_display_str_map = collections.defaultdict(list)
- box_to_color_map = collections.defaultdict(str)
+ if draw_boxes_on_image:
+ # Draw all boxes onto image.
+ draw = ImageDraw.Draw(image)
+ for box, cls, score, color in zip(boxes, classes, scores, colors):
+ left, top, right, bottom = box
+            # draw the object bounding box
+ draw.line([(left, top), (left, bottom), (right, bottom),
+ (right, top), (left, top)], width=line_thickness, fill=color)
+            # draw class and probability information
+ draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)
- filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)
+ if draw_masks_on_image and (masks is not None):
+ # Draw all mask onto image.
+ image = draw_masks(image, masks, colors, mask_thresh)
- # Draw all boxes onto image.
- draw = ImageDraw.Draw(image)
- im_width, im_height = image.size
- for box, color in box_to_color_map.items():
- xmin, ymin, xmax, ymax = box
- (left, right, top, bottom) = (xmin * 1, xmax * 1,
- ymin * 1, ymax * 1)
- draw.line([(left, top), (left, bottom), (right, bottom),
- (right, top), (left, top)], width=line_thickness, fill=color)
- draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
+ return image
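
The rewritten `draw_box_utils.py` replaces `draw_box` with `draw_objs`, which filters detections by `box_thresh`, expects `category_index` keyed by *string* class ids, and returns the annotated image. A hedged usage sketch with made-up predictions (in the repo these values come from the SSD / Faster R-CNN model output):

```python
import numpy as np
from PIL import Image
from draw_box_utils import draw_objs

# Hypothetical single detection; replace with real model outputs.
img = Image.open("test.jpg").convert("RGB")
boxes = np.array([[50., 60., 220., 300.]])      # [xmin, ymin, xmax, ymax]
classes = np.array([1])
scores = np.array([0.92])
category_index = {"1": "person"}                # keys must be strings of class ids

plot_img = draw_objs(img, boxes, classes, scores,
                     category_index=category_index,
                     box_thresh=0.5,             # detections below 0.5 are dropped
                     line_thickness=3,
                     font="arial.ttf",           # falls back to the default font if missing
                     font_size=20)
plot_img.save("test_result.jpg")
```
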
diff --git a/pytorch_object_detection/ssd/my_dataset.py b/pytorch_object_detection/ssd/my_dataset.py
index 23bce7430..ebea5635f 100644
--- a/pytorch_object_detection/ssd/my_dataset.py
+++ b/pytorch_object_detection/ssd/my_dataset.py
@@ -11,7 +11,11 @@ class VOCDataSet(Dataset):
def __init__(self, voc_root, year="2012", transforms=None, train_set='train.txt'):
assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
- self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
+        # add some tolerance for how the dataset path is given
+ if "VOCdevkit" in voc_root:
+ self.root = os.path.join(voc_root, f"VOC{year}")
+ else:
+ self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
self.img_root = os.path.join(self.root, "JPEGImages")
self.annotations_root = os.path.join(self.root, "Annotations")
@@ -24,9 +28,8 @@ def __init__(self, voc_root, year="2012", transforms=None, train_set='train.txt'
# read class_indict
json_file = "./pascal_voc_classes.json"
assert os.path.exists(json_file), "{} file not exist.".format(json_file)
- json_file = open(json_file, 'r')
- self.class_dict = json.load(json_file)
- json_file.close()
+ with open(json_file, 'r') as f:
+ self.class_dict = json.load(f)
self.transforms = transforms
@@ -198,7 +201,7 @@ def collate_fn(batch):
return images, targets
# import transforms
-# from draw_box_utils import draw_box
+# from draw_box_utils import draw_objs
# from PIL import Image
# import json
# import matplotlib.pyplot as plt
@@ -210,7 +213,7 @@ def collate_fn(batch):
# try:
# json_file = open('./pascal_voc_classes.json', 'r')
# class_dict = json.load(json_file)
-# category_index = {v: k for k, v in class_dict.items()}
+# category_index = {str(v): str(k) for k, v in class_dict.items()}
# except Exception as e:
# print(e)
# exit(-1)
@@ -227,12 +230,14 @@ def collate_fn(batch):
# for index in random.sample(range(0, len(train_data_set)), k=5):
# img, target = train_data_set[index]
# img = ts.ToPILImage()(img)
-# draw_box(img,
-# target["boxes"].numpy(),
-# target["labels"].numpy(),
-# [1 for i in range(len(target["labels"].numpy()))],
-# category_index,
-# thresh=0.5,
-# line_thickness=5)
-# plt.imshow(img)
+# plot_img = draw_objs(img,
+# target["boxes"].numpy(),
+# target["labels"].numpy(),
+# np.ones(target["labels"].shape[0]),
+# category_index=category_index,
+# box_thresh=0.5,
+# line_thickness=3,
+# font='arial.ttf',
+# font_size=20)
+# plt.imshow(plot_img)
# plt.show()
diff --git a/pytorch_object_detection/ssd/predict_test.py b/pytorch_object_detection/ssd/predict_test.py
index dee265c49..ea8e8eeef 100644
--- a/pytorch_object_detection/ssd/predict_test.py
+++ b/pytorch_object_detection/ssd/predict_test.py
@@ -8,7 +8,7 @@
import transforms
from src import SSD300, Backbone
-from draw_box_utils import draw_box
+from draw_box_utils import draw_objs
def create_model(num_classes):
@@ -34,8 +34,10 @@ def main():
model = create_model(num_classes=num_classes)
# load train weights
- train_weights = "./save_weights/ssd300-14.pth"
- model.load_state_dict(torch.load(train_weights, map_location=device)['model'])
+ weights_path = "./save_weights/ssd300-14.pth"
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
model.to(device)
# read class_indict
@@ -44,7 +46,7 @@ def main():
json_file = open(json_path, 'r')
class_dict = json.load(json_file)
json_file.close()
- category_index = {v: k for k, v in class_dict.items()}
+ category_index = {str(v): str(k) for k, v in class_dict.items()}
# load image
original_img = Image.open("./test.jpg")
@@ -77,15 +79,19 @@ def main():
if len(predict_boxes) == 0:
print("没有检测到任何目标!")
- draw_box(original_img,
- predict_boxes,
- predict_classes,
- predict_scores,
- category_index,
- thresh=0.5,
- line_thickness=5)
- plt.imshow(original_img)
+ plot_img = draw_objs(original_img,
+ predict_boxes,
+ predict_classes,
+ predict_scores,
+ category_index=category_index,
+ box_thresh=0.5,
+ line_thickness=3,
+ font='arial.ttf',
+ font_size=20)
+ plt.imshow(plot_img)
plt.show()
+    # save the predicted image result
+ plot_img.save("test_result.jpg")
if __name__ == "__main__":
diff --git a/pytorch_object_detection/ssd/validation.py b/pytorch_object_detection/ssd/validation.py
index aed5e55fc..4cda72ab3 100644
--- a/pytorch_object_detection/ssd/validation.py
+++ b/pytorch_object_detection/ssd/validation.py
@@ -101,9 +101,9 @@ def main(parser_data):
# read class_indict
label_json_path = './pascal_voc_classes.json'
assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
- json_file = open(label_json_path, 'r')
- class_dict = json.load(json_file)
- json_file.close()
+ with open(label_json_path, 'r') as f:
+ class_dict = json.load(f)
+
category_index = {v: k for k, v in class_dict.items()}
VOC_root = parser_data.data_path
@@ -133,7 +133,9 @@ def main(parser_data):
# load your own trained model weights
weights_path = parser_data.weights
assert os.path.exists(weights_path), "not found {} file.".format(weights_path)
- model.load_state_dict(torch.load(weights_path, map_location=device)['model'])
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
# print(model)
model.to(device)
diff --git a/pytorch_object_detection/train_coco_dataset/README.md b/pytorch_object_detection/train_coco_dataset/README.md
index db98ce758..eb516bc8b 100644
--- a/pytorch_object_detection/train_coco_dataset/README.md
+++ b/pytorch_object_detection/train_coco_dataset/README.md
@@ -5,7 +5,7 @@
## Environment setup:
* Python3.6/3.7/3.8
-* Pytorch1.7.1
+* Pytorch1.10.0
* pycocotools (Linux: ```pip install pycocotools```; Windows: ```pip install pycocotools-windows``` (no separate Visual Studio install required))
* Ubuntu or Centos (Windows is not recommended)
* Training on a GPU is strongly recommended
@@ -17,28 +17,27 @@
  ├── network_files: Faster R-CNN network (including the Fast R-CNN and RPN modules)
  ├── train_utils: training/validation utilities (including pycocotools)
  ├── my_dataset.py: custom dataset for reading COCO2017
-  ├── train.py: training with VGG16 as the backbone
+  ├── train.py: training with resnet50 as the backbone
  ├── train_multi_GPU.py: for users training with multiple GPUs
  ├── predict.py: simple prediction script that runs inference with trained weights
  ├── validation.py: computes COCO metrics for validation/test data with trained weights and writes record_mAP.txt
-  ├── transforms.py: data preprocessing (random horizontal flip of images and bboxes, PIL image to Tensor)
-  └── compute_receptive_field.py: computes the receptive field of the VGG16 feature extractor (excluding the last maxpool layer, 228)
+  └── transforms.py: data preprocessing (random horizontal flip of images and bboxes, PIL image to Tensor)
```
-## Pre-trained weights download (place them in the backbone folder after downloading):
-* VGG16 https://download.pytorch.org/models/vgg16-397923af.pth
-* Note: remember to rename the downloaded weights; for example, train.py reads the ```vgg16.pth``` file,
-  not ```vgg16-397923af.pth```
+## Pre-trained weights download (place them in the project root after downloading):
+* Resnet50 https://download.pytorch.org/models/resnet50-19c8e357.pth
+* Note: remember to rename the downloaded weights; for example, train.py reads the `resnet50.pth` file,
+  not `resnet50-19c8e357.pth`
## Dataset: this example uses the COCO2017 dataset
* COCO official website: https://cocodataset.org/
* If you are not familiar with the dataset, see my blog post: https://blog.csdn.net/qq_37541097/article/details/113247318
* Taking coco2017 as an example, download the following three files:
-    * ```2017 Train images [118K/18GB]```: all image files used during training
-    * ```2017 Val images [5K/1GB]```: all image files used during validation
-    * ```2017 Train/Val annotations [241MB]```: annotation json files for the training and validation sets
-* Extract everything into a ```coco2017``` folder to get the following file structure:
+    * `2017 Train images [118K/18GB]`: all image files used during training
+    * `2017 Val images [5K/1GB]`: all image files used during validation
+    * `2017 Train/Val annotations [241MB]`: annotation json files for the training and validation sets
+* Extract everything into a `coco2017` folder to get the following file structure:
```
├── coco2017: dataset root directory
    ├── train2017: folder with all training images (118287 images)
@@ -56,35 +55,36 @@
* Make sure the dataset is prepared in advance
* Make sure the corresponding pre-trained model weights are downloaded in advance
* For single-GPU training, use the train.py training script directly
-* For multi-GPU training, use the ```python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py``` command, where ```nproc_per_node``` is the number of GPUs to use
-* To choose which GPU devices to use, prefix the command with ```CUDA_VISIBLE_DEVICES=0,3``` (for example, to use only the 1st and 4th GPUs)
-* ```CUDA_VISIBLE_DEVICES=0,3 python -m torch.distributed.launch --nproc_per_node=2 --use_env train_multi_GPU.py```
+* For multi-GPU training, use the `torchrun --nproc_per_node=8 train_multi_GPU.py` command, where `nproc_per_node` is the number of GPUs to use
+* To choose which GPU devices to use, prefix the command with `CUDA_VISIBLE_DEVICES=0,3` (for example, to use only the 1st and 4th GPUs)
+* `CUDA_VISIBLE_DEVICES=0,3 torchrun --nproc_per_node=2 train_multi_GPU.py`
## Notes
-* When using the training script, make sure '--data-path' is set to the **root directory** that contains your 'coco2017' folder
-* When using the prediction script, set 'train_weights' to the path of the weights you generated.
-* When using the validation script, make sure your validation or test set contains objects of every class; you only need to modify '--num-classes', '--data-path' and '--weights', and other code should be left unchanged as much as possible
+* When using the training script, make sure `--data-path` is set to the **root directory** that contains your `coco2017` folder
+* The `results.txt` saved during training records the COCO metrics on the validation set for each epoch: the first 12 values are the COCO metrics, and the last two are the mean training loss and the learning rate
+* When using the prediction script, set `weights_path` to the path of the weights you generated.
+* When using the validation script, make sure your validation or test set contains objects of every class; you only need to modify `--num-classes`, `--data-path` and `--weights-path`, and other code should be left unchanged as much as possible
-## Weights trained in this project (Faster R-CNN + VGG16)
-* Link: https://pan.baidu.com/s/1fz_9raY6gGLNuAO2_uNp9Q Password: 7l3v
+## Weights trained in this project (Faster R-CNN + Resnet50)
+* Link: https://pan.baidu.com/s/1iF-Yl_9TkFFeAy-JysfGSw Password: d2d8
* mAP on the COCO2017 validation set:
```
- Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.233
- Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.415
- Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.233
- Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.104
- Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.262
- Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.323
- Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.216
- Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.319
- Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.327
- Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.145
- Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.361
- Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.463
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.277
+ Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.453
+ Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.290
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.126
+ Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.308
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.378
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.243
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.358
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.366
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.169
+ Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.402
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.512
```
## If you are not very familiar with how Faster RCNN works, see my bilibili videos
* https://b23.tv/sXcBSP
## Faster RCNN framework diagram
-
\ No newline at end of file
+
diff --git a/pytorch_object_detection/train_coco_dataset/backbone/__init__.py b/pytorch_object_detection/train_coco_dataset/backbone/__init__.py
index f7559da86..292a703ac 100644
--- a/pytorch_object_detection/train_coco_dataset/backbone/__init__.py
+++ b/pytorch_object_detection/train_coco_dataset/backbone/__init__.py
@@ -1,3 +1,5 @@
from .resnet50_fpn_model import resnet50_fpn_backbone
from .mobilenetv2_model import MobileNetV2
from .vgg_model import vgg
+from .resnet import *
+from .feature_pyramid_network import BackboneWithFPN, LastLevelMaxPool
diff --git a/pytorch_object_detection/train_coco_dataset/backbone/feature_pyramid_network.py b/pytorch_object_detection/train_coco_dataset/backbone/feature_pyramid_network.py
index 79739f219..fc2fc757f 100644
--- a/pytorch_object_detection/train_coco_dataset/backbone/feature_pyramid_network.py
+++ b/pytorch_object_detection/train_coco_dataset/backbone/feature_pyramid_network.py
@@ -8,6 +8,111 @@
from torch.jit.annotations import Tuple, List, Dict
+class IntermediateLayerGetter(nn.ModuleDict):
+ """
+ Module wrapper that returns intermediate layers from a model
+ It has a strong assumption that the modules have been registered
+ into the model in the same order as they are used.
+ This means that one should **not** reuse the same nn.Module
+ twice in the forward if you want this to work.
+ Additionally, it is only able to query submodules that are directly
+ assigned to the model. So if `model` is passed, `model.feature1` can
+ be returned, but not `model.feature1.layer2`.
+ Arguments:
+ model (nn.Module): model on which we will extract the features
+ return_layers (Dict[name, new_name]): a dict containing the names
+ of the modules for which the activations will be returned as
+ the key of the dict, and the value of the dict is the name
+ of the returned activation (which the user can specify).
+ """
+ __annotations__ = {
+ "return_layers": Dict[str, str],
+ }
+
+ def __init__(self, model, return_layers):
+ if not set(return_layers).issubset([name for name, _ in model.named_children()]):
+ raise ValueError("return_layers are not present in model")
+
+ orig_return_layers = return_layers
+ return_layers = {str(k): str(v) for k, v in return_layers.items()}
+ layers = OrderedDict()
+
+        # iterate over the model's child modules in order and store them in an OrderedDict
+        # keep everything up to and including layer4, discard the unused layers after it
+ for name, module in model.named_children():
+ layers[name] = module
+ if name in return_layers:
+ del return_layers[name]
+ if not return_layers:
+ break
+
+ super().__init__(layers)
+ self.return_layers = orig_return_layers
+
+ def forward(self, x):
+ out = OrderedDict()
+        # run the forward pass through all child modules in order,
+        # collecting the outputs of layer1, layer2, layer3 and layer4
+ for name, module in self.items():
+ x = module(x)
+ if name in self.return_layers:
+ out_name = self.return_layers[name]
+ out[out_name] = x
+ return out
+
+
+class BackboneWithFPN(nn.Module):
+ """
+ Adds a FPN on top of a model.
+ Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
+ extract a submodel that returns the feature maps specified in return_layers.
+    The same limitations of IntermediateLayerGetter apply here.
+ Arguments:
+ backbone (nn.Module)
+ return_layers (Dict[name, new_name]): a dict containing the names
+ of the modules for which the activations will be returned as
+ the key of the dict, and the value of the dict is the name
+ of the returned activation (which the user can specify).
+ in_channels_list (List[int]): number of channels for each feature map
+ that is returned, in the order they are present in the OrderedDict
+ out_channels (int): number of channels in the FPN.
+ extra_blocks: ExtraFPNBlock
+ Attributes:
+ out_channels (int): the number of channels in the FPN
+ """
+
+ def __init__(self,
+ backbone: nn.Module,
+ return_layers=None,
+ in_channels_list=None,
+ out_channels=256,
+ extra_blocks=None,
+ re_getter=True):
+ super().__init__()
+
+ if extra_blocks is None:
+ extra_blocks = LastLevelMaxPool()
+
+ if re_getter:
+ assert return_layers is not None
+ self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+ else:
+ self.body = backbone
+
+ self.fpn = FeaturePyramidNetwork(
+ in_channels_list=in_channels_list,
+ out_channels=out_channels,
+ extra_blocks=extra_blocks,
+ )
+
+ self.out_channels = out_channels
+
+ def forward(self, x):
+ x = self.body(x)
+ x = self.fpn(x)
+ return x
+
+
class FeaturePyramidNetwork(nn.Module):
"""
Module that adds a FPN from on top of a set of feature maps. This is based on
@@ -27,7 +132,7 @@ class FeaturePyramidNetwork(nn.Module):
"""
def __init__(self, in_channels_list, out_channels, extra_blocks=None):
- super(FeaturePyramidNetwork, self).__init__()
+ super().__init__()
# 用来调整resnet特征矩阵(layer1,2,3,4)的channel(kernel_size=1)
self.inner_blocks = nn.ModuleList()
# 对调整后的特征矩阵使用3x3的卷积核来得到对应的预测特征矩阵
@@ -48,8 +153,7 @@ def __init__(self, in_channels_list, out_channels, extra_blocks=None):
self.extra_blocks = extra_blocks
- def get_result_from_inner_blocks(self, x, idx):
- # type: (Tensor, int) -> Tensor
+ def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.inner_blocks[idx](x),
but torchscript doesn't support this yet
@@ -65,8 +169,7 @@ def get_result_from_inner_blocks(self, x, idx):
i += 1
return out
- def get_result_from_layer_blocks(self, x, idx):
- # type: (Tensor, int) -> Tensor
+ def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.layer_blocks[idx](x),
but torchscript doesn't support this yet
@@ -82,8 +185,7 @@ def get_result_from_layer_blocks(self, x, idx):
i += 1
return out
- def forward(self, x):
- # type: (Dict[str, Tensor]) -> Dict[str, Tensor]
+ def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""
Computes the FPN for a set of feature maps.
Arguments:
@@ -127,8 +229,7 @@ class LastLevelMaxPool(torch.nn.Module):
Applies a max_pool2d on top of the last feature map
"""
- def forward(self, x, y, names):
- # type: (List[Tensor], List[Tensor], List[str]) -> Tuple[List[Tensor], List[str]]
+ def forward(self, x: List[Tensor], y: List[Tensor], names: List[str]) -> Tuple[List[Tensor], List[str]]:
names.append("pool")
x.append(F.max_pool2d(x[-1], 1, 2, 0))
return x, names
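
`BackboneWithFPN` now lives next to `FeaturePyramidNetwork` and gains a `re_getter` switch: with `re_getter=True` (the default) it wraps the backbone in `IntermediateLayerGetter` via `return_layers`; with `re_getter=False` it assumes the backbone already returns an ordered dict of feature maps. A rough sketch of the first path, assuming a plain torchvision ResNet-50 (the channel numbers are the standard layer1–layer4 outputs, not something defined in this file):

```python
import torch
import torchvision
from backbone import BackboneWithFPN  # re-exported in backbone/__init__.py

resnet = torchvision.models.resnet50(pretrained=False)
# map backbone children to FPN level names "0".."3"
return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
in_channels_list = [256, 512, 1024, 2048]  # output channels of layer1..layer4

backbone = BackboneWithFPN(resnet,
                           return_layers=return_layers,
                           in_channels_list=in_channels_list,
                           out_channels=256)          # re_getter=True by default

feats = backbone(torch.randn(1, 3, 224, 224))
# OrderedDict with keys "0".."3" plus "pool" from LastLevelMaxPool, all with 256 channels
print([(k, tuple(v.shape)) for k, v in feats.items()])
```
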
diff --git a/pytorch_object_detection/train_coco_dataset/backbone/resnet.py b/pytorch_object_detection/train_coco_dataset/backbone/resnet.py
new file mode 100644
index 000000000..c2aa086fe
--- /dev/null
+++ b/pytorch_object_detection/train_coco_dataset/backbone/resnet.py
@@ -0,0 +1,198 @@
+import torch.nn as nn
+import torch
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, in_channel, out_channel, stride=1, downsample=None, **kwargs):
+ super(BasicBlock, self).__init__()
+ self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
+ kernel_size=3, stride=stride, padding=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(out_channel)
+ self.relu = nn.ReLU()
+ self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
+ kernel_size=3, stride=1, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(out_channel)
+ self.downsample = downsample
+
+ def forward(self, x):
+ identity = x
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ """
+    Note: in the original paper, on the main branch of the dashed (downsampling) residual block,
+    the first 1x1 conv has stride 2 and the second 3x3 conv has stride 1. In the official PyTorch
+    implementation the first 1x1 conv has stride 1 and the second 3x3 conv has stride 2, which improves top-1 accuracy by roughly 0.5%.
+    See Resnet v1.5 https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch
+ """
+ expansion = 4
+
+ def __init__(self, in_channel, out_channel, stride=1, downsample=None,
+ groups=1, width_per_group=64):
+ super(Bottleneck, self).__init__()
+
+ width = int(out_channel * (width_per_group / 64.)) * groups
+
+ self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=width,
+ kernel_size=1, stride=1, bias=False) # squeeze channels
+ self.bn1 = nn.BatchNorm2d(width)
+ # -----------------------------------------
+ self.conv2 = nn.Conv2d(in_channels=width, out_channels=width, groups=groups,
+ kernel_size=3, stride=stride, bias=False, padding=1)
+ self.bn2 = nn.BatchNorm2d(width)
+ # -----------------------------------------
+ self.conv3 = nn.Conv2d(in_channels=width, out_channels=out_channel*self.expansion,
+ kernel_size=1, stride=1, bias=False) # unsqueeze channels
+ self.bn3 = nn.BatchNorm2d(out_channel*self.expansion)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+
+ def forward(self, x):
+ identity = x
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(nn.Module):
+
+ def __init__(self,
+ block,
+ blocks_num,
+ num_classes=1000,
+ include_top=True,
+ groups=1,
+ width_per_group=64):
+ super(ResNet, self).__init__()
+ self.include_top = include_top
+ self.in_channel = 64
+
+ self.groups = groups
+ self.width_per_group = width_per_group
+
+ self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
+ padding=3, bias=False)
+ self.bn1 = nn.BatchNorm2d(self.in_channel)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ self.layer1 = self._make_layer(block, 64, blocks_num[0])
+ self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
+ self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
+ self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
+ if self.include_top:
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) # output size = (1, 1)
+ self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+
+ def _make_layer(self, block, channel, block_num, stride=1):
+ downsample = None
+ if stride != 1 or self.in_channel != channel * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(channel * block.expansion))
+
+ layers = []
+ layers.append(block(self.in_channel,
+ channel,
+ downsample=downsample,
+ stride=stride,
+ groups=self.groups,
+ width_per_group=self.width_per_group))
+ self.in_channel = channel * block.expansion
+
+ for _ in range(1, block_num):
+ layers.append(block(self.in_channel,
+ channel,
+ groups=self.groups,
+ width_per_group=self.width_per_group))
+
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+
+ x = self.layer1(x)
+ x = self.layer2(x)
+ x = self.layer3(x)
+ x = self.layer4(x)
+
+ if self.include_top:
+ x = self.avgpool(x)
+ x = torch.flatten(x, 1)
+ x = self.fc(x)
+
+ return x
+
+
+def resnet34(num_classes=1000, include_top=True):
+ # https://download.pytorch.org/models/resnet34-333f7ec4.pth
+ return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes, include_top=include_top)
+
+
+def resnet50(num_classes=1000, include_top=True):
+ # https://download.pytorch.org/models/resnet50-19c8e357.pth
+ return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, include_top=include_top)
+
+
+def resnet101(num_classes=1000, include_top=True):
+ # https://download.pytorch.org/models/resnet101-5d3b4d8f.pth
+ return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, include_top=include_top)
+
+
+def resnext50_32x4d(num_classes=1000, include_top=True):
+ # https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth
+ groups = 32
+ width_per_group = 4
+ return ResNet(Bottleneck, [3, 4, 6, 3],
+ num_classes=num_classes,
+ include_top=include_top,
+ groups=groups,
+ width_per_group=width_per_group)
+
+
+def resnext101_32x8d(num_classes=1000, include_top=True):
+ # https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth
+ groups = 32
+ width_per_group = 8
+ return ResNet(Bottleneck, [3, 4, 23, 3],
+ num_classes=num_classes,
+ include_top=include_top,
+ groups=groups,
+ width_per_group=width_per_group)
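
The new `backbone/resnet.py` mirrors the torchvision ResNet family, with `include_top` controlling whether the avgpool/fc head is built. A possible way to use it as a detection backbone with the official ImageNet weights, assuming the downloaded file has been renamed to `resnet50.pth` as the README suggests:

```python
import torch
from backbone.resnet import resnet50

# Backbone without the classification head; include_top=False skips avgpool/fc.
net = resnet50(include_top=False)

# Assumed local copy of the official weights (resnet50-19c8e357.pth renamed to resnet50.pth).
weights = torch.load("resnet50.pth", map_location="cpu")
# fc.* keys have no counterpart when include_top=False, so load non-strictly.
missing, unexpected = net.load_state_dict(weights, strict=False)
print("unexpected keys (expected to be fc.*):", unexpected)

feat = net(torch.randn(1, 3, 224, 224))
print(feat.shape)  # torch.Size([1, 2048, 7, 7]) -- layer4 output, stride 32
```
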
diff --git a/pytorch_object_detection/train_coco_dataset/backbone/resnet50_fpn_model.py b/pytorch_object_detection/train_coco_dataset/backbone/resnet50_fpn_model.py
index 8c796cfac..b15930765 100644
--- a/pytorch_object_detection/train_coco_dataset/backbone/resnet50_fpn_model.py
+++ b/pytorch_object_detection/train_coco_dataset/backbone/resnet50_fpn_model.py
@@ -1,19 +1,17 @@
import os
-from collections import OrderedDict
import torch
import torch.nn as nn
-from torch.jit.annotations import List, Dict
from torchvision.ops.misc import FrozenBatchNorm2d
-from .feature_pyramid_network import FeaturePyramidNetwork, LastLevelMaxPool
+from .feature_pyramid_network import BackboneWithFPN, LastLevelMaxPool
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None):
- super(Bottleneck, self).__init__()
+ super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
@@ -56,7 +54,7 @@ def forward(self, x):
class ResNet(nn.Module):
def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None):
- super(ResNet, self).__init__()
+ super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
@@ -136,100 +134,6 @@ def overwrite_eps(model, eps):
module.eps = eps
-class IntermediateLayerGetter(nn.ModuleDict):
- """
- Module wrapper that returns intermediate layers from a model
- It has a strong assumption that the modules have been registered
- into the model in the same order as they are used.
- This means that one should **not** reuse the same nn.Module
- twice in the forward if you want this to work.
- Additionally, it is only able to query submodules that are directly
- assigned to the model. So if `model` is passed, `model.feature1` can
- be returned, but not `model.feature1.layer2`.
- Arguments:
- model (nn.Module): model on which we will extract the features
- return_layers (Dict[name, new_name]): a dict containing the names
- of the modules for which the activations will be returned as
- the key of the dict, and the value of the dict is the name
- of the returned activation (which the user can specify).
- """
- __annotations__ = {
- "return_layers": Dict[str, str],
- }
-
- def __init__(self, model, return_layers):
- if not set(return_layers).issubset([name for name, _ in model.named_children()]):
- raise ValueError("return_layers are not present in model")
-
- orig_return_layers = return_layers
- return_layers = {str(k): str(v) for k, v in return_layers.items()}
- layers = OrderedDict()
-
- # 遍历模型子模块按顺序存入有序字典
- # 只保存layer4及其之前的结构,舍去之后不用的结构
- for name, module in model.named_children():
- layers[name] = module
- if name in return_layers:
- del return_layers[name]
- if not return_layers:
- break
-
- super(IntermediateLayerGetter, self).__init__(layers)
- self.return_layers = orig_return_layers
-
- def forward(self, x):
- out = OrderedDict()
- # 依次遍历模型的所有子模块,并进行正向传播,
- # 收集layer1, layer2, layer3, layer4的输出
- for name, module in self.items():
- x = module(x)
- if name in self.return_layers:
- out_name = self.return_layers[name]
- out[out_name] = x
- return out
-
-
-class BackboneWithFPN(nn.Module):
- """
- Adds a FPN on top of a model.
- Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
- extract a submodel that returns the feature maps specified in return_layers.
- The same limitations of IntermediatLayerGetter apply here.
- Arguments:
- backbone (nn.Module)
- return_layers (Dict[name, new_name]): a dict containing the names
- of the modules for which the activations will be returned as
- the key of the dict, and the value of the dict is the name
- of the returned activation (which the user can specify).
- in_channels_list (List[int]): number of channels for each feature map
- that is returned, in the order they are present in the OrderedDict
- out_channels (int): number of channels in the FPN.
- extra_blocks: ExtraFPNBlock
- Attributes:
- out_channels (int): the number of channels in the FPN
- """
-
- def __init__(self, backbone, return_layers, in_channels_list, out_channels, extra_blocks=None):
- super(BackboneWithFPN, self).__init__()
-
- if extra_blocks is None:
- extra_blocks = LastLevelMaxPool()
-
- self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
- self.fpn = FeaturePyramidNetwork(
- in_channels_list=in_channels_list,
- out_channels=out_channels,
- extra_blocks=extra_blocks,
- )
-
- self.out_channels = out_channels
-
- def forward(self, x):
- x = self.body(x)
- x = self.fpn(x)
- return x
-
-
def resnet50_fpn_backbone(pretrain_path="",
norm_layer=FrozenBatchNorm2d, # FrozenBatchNorm2d的功能与BatchNorm2d类似,但参数无法更新
trainable_layers=3,
diff --git a/pytorch_object_detection/train_coco_dataset/change_backbone_with_fpn.py b/pytorch_object_detection/train_coco_dataset/change_backbone_with_fpn.py
new file mode 100644
index 000000000..36b2fa554
--- /dev/null
+++ b/pytorch_object_detection/train_coco_dataset/change_backbone_with_fpn.py
@@ -0,0 +1,257 @@
+import os
+import datetime
+
+import torch
+
+import transforms
+from network_files import FasterRCNN, AnchorsGenerator
+from my_dataset import CocoDetection
+from train_utils import GroupedBatchSampler, create_aspect_ratio_groups
+from train_utils import train_eval_utils as utils
+from backbone import BackboneWithFPN, LastLevelMaxPool
+
+
+def create_model(num_classes):
+ import torchvision
+ from torchvision.models.feature_extraction import create_feature_extractor
+
+ # --- mobilenet_v3_large fpn backbone --- #
+ backbone = torchvision.models.mobilenet_v3_large(pretrained=True)
+ # print(backbone)
+ return_layers = {"features.6": "0", # stride 8
+ "features.12": "1", # stride 16
+ "features.16": "2"} # stride 32
+    # channels of each feature map fed to the fpn
+ in_channels_list = [40, 112, 960]
+ new_backbone = create_feature_extractor(backbone, return_layers)
+ # img = torch.randn(1, 3, 224, 224)
+ # outputs = new_backbone(img)
+ # [print(f"{k} shape: {v.shape}") for k, v in outputs.items()]
+
+ # --- efficientnet_b0 fpn backbone --- #
+ # backbone = torchvision.models.efficientnet_b0(pretrained=True)
+ # # print(backbone)
+ # return_layers = {"features.3": "0", # stride 8
+ # "features.4": "1", # stride 16
+ # "features.8": "2"} # stride 32
+ # # 提供给fpn的每个特征层channel
+ # in_channels_list = [40, 80, 1280]
+ # new_backbone = create_feature_extractor(backbone, return_layers)
+ # # img = torch.randn(1, 3, 224, 224)
+ # # outputs = new_backbone(img)
+ # # [print(f"{k} shape: {v.shape}") for k, v in outputs.items()]
+
+ backbone_with_fpn = BackboneWithFPN(new_backbone,
+ return_layers=return_layers,
+ in_channels_list=in_channels_list,
+ out_channels=256,
+ extra_blocks=LastLevelMaxPool(),
+ re_getter=False)
+
+ anchor_sizes = ((64,), (128,), (256,), (512,))
+ aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
+ anchor_generator = AnchorsGenerator(sizes=anchor_sizes,
+ aspect_ratios=aspect_ratios)
+
+    roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0', '1', '2'],  # feature maps on which RoIAlign pooling is performed
+                                                    output_size=[7, 7],  # output size of the RoIAlign pooled features
+                                                    sampling_ratio=2)  # sampling ratio
+
+ model = FasterRCNN(backbone=backbone_with_fpn,
+ num_classes=num_classes,
+ rpn_anchor_generator=anchor_generator,
+ box_roi_pool=roi_pooler)
+
+ return model
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+ print("Using {} device training.".format(device.type))
+
+    # file used to save coco_info
+ results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
+
+ data_transform = {
+ "train": transforms.Compose([transforms.ToTensor(),
+ transforms.RandomHorizontalFlip(0.5)]),
+ "val": transforms.Compose([transforms.ToTensor()])
+ }
+
+ COCO_root = args.data_path
+
+ # load train data set
+ # coco2017 -> annotations -> instances_train2017.json
+ train_dataset = CocoDetection(COCO_root, "train", data_transform["train"])
+ train_sampler = None
+
+    # whether to sample images with similar aspect ratios into the same batch
+    # doing so reduces the GPU memory needed during training; enabled by default
+    if args.aspect_ratio_group_factor >= 0:
+        train_sampler = torch.utils.data.RandomSampler(train_dataset)
+        # compute, for every image, the index of the aspect-ratio bin it falls into
+        group_ids = create_aspect_ratio_groups(train_dataset, k=args.aspect_ratio_group_factor)
+        # each batch is drawn from a single aspect-ratio bin
+        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
+
+    # note that collate_fn is custom here: the loaded data contains both image and targets, so the default batching cannot be used
+ batch_size = args.batch_size
+ nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
+ print('Using %g dataloader workers' % nw)
+ if train_sampler:
+        # when sampling images by aspect ratio, the dataloader must use batch_sampler
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_sampler=train_batch_sampler,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+ else:
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+ # load validation data set
+ # coco2017 -> annotations -> instances_val2017.json
+ val_dataset = CocoDetection(COCO_root, "val", data_transform["val"])
+ val_data_set_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=1,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=val_dataset.collate_fn)
+
+ # create model num_classes equal background + classes
+ model = create_model(num_classes=args.num_classes + 1)
+ # print(model)
+
+ model.to(device)
+
+ # define optimizer
+ params = [p for p in model.parameters() if p.requires_grad]
+ optimizer = torch.optim.SGD(params,
+ lr=args.lr,
+ momentum=args.momentum,
+ weight_decay=args.weight_decay)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+ # learning rate scheduler
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
+ milestones=args.lr_steps,
+ gamma=args.lr_gamma)
+
+    # if a checkpoint from a previous run is specified, resume training from it
+ if args.resume != "":
+ checkpoint = torch.load(args.resume, map_location='cpu')
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp and "scaler" in checkpoint:
+ scaler.load_state_dict(checkpoint["scaler"])
+ print("the training process from epoch{}...".format(args.start_epoch))
+
+ train_loss = []
+ learning_rate = []
+ val_map = []
+
+ for epoch in range(args.start_epoch, args.epochs):
+ # train for one epoch, printing every 10 iterations
+ mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
+ device=device, epoch=epoch,
+ print_freq=50, warmup=True,
+ scaler=scaler)
+ train_loss.append(mean_loss.item())
+ learning_rate.append(lr)
+
+ # update the learning rate
+ lr_scheduler.step()
+
+ # evaluate on the test dataset
+ coco_info = utils.evaluate(model, val_data_set_loader, device=device)
+
+ # write into txt
+ with open(results_file, "a") as f:
+            # the written data includes the coco metrics plus the loss and learning rate
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
+ txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
+ f.write(txt + "\n")
+
+ val_map.append(coco_info[1]) # pascal mAP
+
+ # save weights
+ save_files = {
+ 'model': model.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'epoch': epoch}
+ if args.amp:
+ save_files["scaler"] = scaler.state_dict()
+ torch.save(save_files, "./save_weights/model-{}.pth".format(epoch))
+
+ # plot loss and lr curve
+ if len(train_loss) != 0 and len(learning_rate) != 0:
+ from plot_curve import plot_loss_and_lr
+ plot_loss_and_lr(train_loss, learning_rate)
+
+ # plot mAP curve
+ if len(val_map) != 0:
+ from plot_curve import plot_map
+ plot_map(val_map)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+    # training device type
+    parser.add_argument('--device', default='cuda:0', help='device')
+    # root directory of the training dataset
+    parser.add_argument('--data-path', default='/data/coco2017', help='dataset')
+    # number of detection classes (excluding background)
+    parser.add_argument('--num-classes', default=90, type=int, help='num_classes')
+    # directory for saving files
+    parser.add_argument('--output-dir', default='./save_weights', help='path where to save')
+    # to resume training, specify the path of the checkpoint saved by the previous run
+    parser.add_argument('--resume', default='', type=str, help='resume from checkpoint')
+    # epoch to resume training from
+    parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
+    # total number of training epochs
+    parser.add_argument('--epochs', default=26, type=int, metavar='N',
+                        help='number of total epochs to run')
+    # learning rate
+    parser.add_argument('--lr', default=0.005, type=float,
+                        help='initial learning rate, 0.02 is the default value for training '
+                             'on 8 gpus and 2 images_per_gpu')
+    # momentum parameter for SGD
+    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                        help='momentum')
+    # weight_decay parameter for SGD
+    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                        metavar='W', help='weight decay (default: 1e-4)',
+                        dest='weight_decay')
+    # parameters for torch.optim.lr_scheduler.MultiStepLR
+    parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int,
+                        help='decrease lr every step-size epochs')
+    # parameters for torch.optim.lr_scheduler.MultiStepLR
+    parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
+    # training batch size
+    parser.add_argument('--batch_size', default=4, type=int, metavar='N',
+                        help='batch size when training.')
+    parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
+    # whether to use mixed-precision training (requires a GPU with mixed-precision support)
+    parser.add_argument("--amp", default=False, help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+ print(args)
+
+    # check whether the folder for saving weights exists; create it if not
+ if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+
+ main(args)
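
Before plugging a new backbone into `BackboneWithFPN`, it is worth confirming that the chosen `return_layers` really produce the channel counts listed in `in_channels_list`; the commented-out lines in `create_model` hint at exactly this check. A standalone sketch for the mobilenet_v3_large case used above (pretrained=False here only to avoid a download; the script uses pretrained weights):

```python
import torch
import torchvision
from torchvision.models.feature_extraction import create_feature_extractor

backbone = torchvision.models.mobilenet_v3_large(pretrained=False)
return_layers = {"features.6": "0",    # stride 8
                 "features.12": "1",   # stride 16
                 "features.16": "2"}   # stride 32
extractor = create_feature_extractor(backbone, return_layers)

with torch.no_grad():
    outputs = extractor(torch.randn(1, 3, 224, 224))
for name, feat in outputs.items():
    # the channel dim (feat.shape[1]) must match in_channels_list, here [40, 112, 960]
    print(f"{name}: shape {tuple(feat.shape)}")
```
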
diff --git a/pytorch_object_detection/train_coco_dataset/coco80_indices.json b/pytorch_object_detection/train_coco_dataset/coco80_indices.json
deleted file mode 100644
index e317e9299..000000000
--- a/pytorch_object_detection/train_coco_dataset/coco80_indices.json
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "1": "person",
- "2": "bicycle",
- "3": "car",
- "4": "motorcycle",
- "5": "airplane",
- "6": "bus",
- "7": "train",
- "8": "truck",
- "9": "boat",
- "10": "traffic light",
- "11": "fire hydrant",
- "12": "stop sign",
- "13": "parking meter",
- "14": "bench",
- "15": "bird",
- "16": "cat",
- "17": "dog",
- "18": "horse",
- "19": "sheep",
- "20": "cow",
- "21": "elephant",
- "22": "bear",
- "23": "zebra",
- "24": "giraffe",
- "25": "backpack",
- "26": "umbrella",
- "27": "handbag",
- "28": "tie",
- "29": "suitcase",
- "30": "frisbee",
- "31": "skis",
- "32": "snowboard",
- "33": "sports ball",
- "34": "kite",
- "35": "baseball bat",
- "36": "baseball glove",
- "37": "skateboard",
- "38": "surfboard",
- "39": "tennis racket",
- "40": "bottle",
- "41": "wine glass",
- "42": "cup",
- "43": "fork",
- "44": "knife",
- "45": "spoon",
- "46": "bowl",
- "47": "banana",
- "48": "apple",
- "49": "sandwich",
- "50": "orange",
- "51": "broccoli",
- "52": "carrot",
- "53": "hot dog",
- "54": "pizza",
- "55": "donut",
- "56": "cake",
- "57": "chair",
- "58": "couch",
- "59": "potted plant",
- "60": "bed",
- "61": "dining table",
- "62": "toilet",
- "63": "tv",
- "64": "laptop",
- "65": "mouse",
- "66": "remote",
- "67": "keyboard",
- "68": "cell phone",
- "69": "microwave",
- "70": "oven",
- "71": "toaster",
- "72": "sink",
- "73": "refrigerator",
- "74": "book",
- "75": "clock",
- "76": "vase",
- "77": "scissors",
- "78": "teddy bear",
- "79": "hair drier",
- "80": "toothbrush"
-}
\ No newline at end of file
diff --git a/pytorch_object_detection/train_coco_dataset/coco91_indices.json b/pytorch_object_detection/train_coco_dataset/coco91_indices.json
new file mode 100644
index 000000000..decbe58ce
--- /dev/null
+++ b/pytorch_object_detection/train_coco_dataset/coco91_indices.json
@@ -0,0 +1,92 @@
+{
+ "1": "person",
+ "2": "bicycle",
+ "3": "car",
+ "4": "motorcycle",
+ "5": "airplane",
+ "6": "bus",
+ "7": "train",
+ "8": "truck",
+ "9": "boat",
+ "10": "traffic light",
+ "11": "fire hydrant",
+ "12": "N/A",
+ "13": "stop sign",
+ "14": "parking meter",
+ "15": "bench",
+ "16": "bird",
+ "17": "cat",
+ "18": "dog",
+ "19": "horse",
+ "20": "sheep",
+ "21": "cow",
+ "22": "elephant",
+ "23": "bear",
+ "24": "zebra",
+ "25": "giraffe",
+ "26": "N/A",
+ "27": "backpack",
+ "28": "umbrella",
+ "29": "N/A",
+ "30": "N/A",
+ "31": "handbag",
+ "32": "tie",
+ "33": "suitcase",
+ "34": "frisbee",
+ "35": "skis",
+ "36": "snowboard",
+ "37": "sports ball",
+ "38": "kite",
+ "39": "baseball bat",
+ "40": "baseball glove",
+ "41": "skateboard",
+ "42": "surfboard",
+ "43": "tennis racket",
+ "44": "bottle",
+ "45": "N/A",
+ "46": "wine glass",
+ "47": "cup",
+ "48": "fork",
+ "49": "knife",
+ "50": "spoon",
+ "51": "bowl",
+ "52": "banana",
+ "53": "apple",
+ "54": "sandwich",
+ "55": "orange",
+ "56": "broccoli",
+ "57": "carrot",
+ "58": "hot dog",
+ "59": "pizza",
+ "60": "donut",
+ "61": "cake",
+ "62": "chair",
+ "63": "couch",
+ "64": "potted plant",
+ "65": "bed",
+ "66": "N/A",
+ "67": "dining table",
+ "68": "N/A",
+ "69": "N/A",
+ "70": "toilet",
+ "71": "N/A",
+ "72": "tv",
+ "73": "laptop",
+ "74": "mouse",
+ "75": "remote",
+ "76": "keyboard",
+ "77": "cell phone",
+ "78": "microwave",
+ "79": "oven",
+ "80": "toaster",
+ "81": "sink",
+ "82": "refrigerator",
+ "83": "N/A",
+ "84": "book",
+ "85": "clock",
+ "86": "vase",
+ "87": "scissors",
+ "88": "teddy bear",
+ "89": "hair drier",
+ "90": "toothbrush"
+}
\ No newline at end of file
diff --git a/pytorch_object_detection/train_coco_dataset/coco91_to_80.json b/pytorch_object_detection/train_coco_dataset/coco91_to_80.json
deleted file mode 100644
index fd190538e..000000000
--- a/pytorch_object_detection/train_coco_dataset/coco91_to_80.json
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "1": 1,
- "2": 2,
- "3": 3,
- "4": 4,
- "5": 5,
- "6": 6,
- "7": 7,
- "8": 8,
- "9": 9,
- "10": 10,
- "11": 11,
- "13": 12,
- "14": 13,
- "15": 14,
- "16": 15,
- "17": 16,
- "18": 17,
- "19": 18,
- "20": 19,
- "21": 20,
- "22": 21,
- "23": 22,
- "24": 23,
- "25": 24,
- "27": 25,
- "28": 26,
- "31": 27,
- "32": 28,
- "33": 29,
- "34": 30,
- "35": 31,
- "36": 32,
- "37": 33,
- "38": 34,
- "39": 35,
- "40": 36,
- "41": 37,
- "42": 38,
- "43": 39,
- "44": 40,
- "46": 41,
- "47": 42,
- "48": 43,
- "49": 44,
- "50": 45,
- "51": 46,
- "52": 47,
- "53": 48,
- "54": 49,
- "55": 50,
- "56": 51,
- "57": 52,
- "58": 53,
- "59": 54,
- "60": 55,
- "61": 56,
- "62": 57,
- "63": 58,
- "64": 59,
- "65": 60,
- "67": 61,
- "70": 62,
- "72": 63,
- "73": 64,
- "74": 65,
- "75": 66,
- "76": 67,
- "77": 68,
- "78": 69,
- "79": 70,
- "80": 71,
- "81": 72,
- "82": 73,
- "84": 74,
- "85": 75,
- "86": 76,
- "87": 77,
- "88": 78,
- "89": 79,
- "90": 80
-}
\ No newline at end of file
diff --git a/pytorch_object_detection/train_coco_dataset/draw_box_utils.py b/pytorch_object_detection/train_coco_dataset/draw_box_utils.py
index 25d86f4fa..835d7f7c1 100644
--- a/pytorch_object_detection/train_coco_dataset/draw_box_utils.py
+++ b/pytorch_object_detection/train_coco_dataset/draw_box_utils.py
@@ -1,6 +1,7 @@
-import collections
+from PIL.Image import Image, fromarray
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
+from PIL import ImageColor
import numpy as np
STANDARD_COLORS = [
@@ -30,68 +31,123 @@
]
-def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map):
- for i in range(boxes.shape[0]):
- if scores[i] > thresh:
- box = tuple(boxes[i].tolist()) # numpy -> list -> tuple
- if classes[i] in category_index.keys():
- class_name = category_index[classes[i]]
- elif str(classes[i]) in category_index.keys():
- class_name = category_index[str(classes[i])]
- else:
- class_name = 'N/A'
- display_str = str(class_name)
- display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
- box_to_display_str_map[box].append(display_str)
- box_to_color_map[box] = STANDARD_COLORS[
- classes[i] % len(STANDARD_COLORS)]
- else:
- break # 网络输出概率已经排序过,当遇到一个不满足后面的肯定不满足
-
-
-def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
+def draw_text(draw,
+ box: list,
+ cls: int,
+ score: float,
+ category_index: dict,
+ color: str,
+ font: str = 'arial.ttf',
+ font_size: int = 24):
+ """
+    Draw the object bounding box and class information onto the image
+ """
try:
- font = ImageFont.truetype('arial.ttf', 24)
+ font = ImageFont.truetype(font, font_size)
except IOError:
font = ImageFont.load_default()
+ left, top, right, bottom = box
# If the total height of the display strings added to the top of the bounding
# box exceeds the top of the image, stack the strings below the bounding box
# instead of above.
- display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]]
+ display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
+ display_str_heights = [font.getsize(ds)[1] for ds in display_str]
# Each display_str has a top and bottom margin of 0.05x.
- total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
+ display_str_height = (1 + 2 * 0.05) * max(display_str_heights)
- if top > total_display_str_height:
+ if top > display_str_height:
+ text_top = top - display_str_height
text_bottom = top
else:
- text_bottom = bottom + total_display_str_height
- # Reverse list and print from bottom to top.
- for display_str in box_to_display_str_map[box][::-1]:
- text_width, text_height = font.getsize(display_str)
- margin = np.ceil(0.05 * text_height)
- draw.rectangle([(left, text_bottom - text_height - 2 * margin),
- (left + text_width, text_bottom)], fill=color)
- draw.text((left + margin, text_bottom - text_height - margin),
- display_str,
+ text_top = bottom
+ text_bottom = bottom + display_str_height
+
+ for ds in display_str:
+ text_width, text_height = font.getsize(ds)
+ margin = np.ceil(0.05 * text_width)
+ draw.rectangle([(left, text_top),
+ (left + text_width + 2 * margin, text_bottom)], fill=color)
+ draw.text((left + margin, text_top),
+ ds,
fill='black',
font=font)
- text_bottom -= text_height - 2 * margin
+ left += text_width
+
+
+def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
+ np_image = np.array(image)
+ masks = np.where(masks > thresh, True, False)
+
+ # colors = np.array(colors)
+ img_to_draw = np.copy(np_image)
+ # TODO: There might be a way to vectorize this
+ for mask, color in zip(masks, colors):
+ img_to_draw[mask] = color
+
+ out = np_image * (1 - alpha) + img_to_draw * alpha
+ return fromarray(out.astype(np.uint8))
+
+
+def draw_objs(image: Image,
+ boxes: np.ndarray = None,
+ classes: np.ndarray = None,
+ scores: np.ndarray = None,
+ masks: np.ndarray = None,
+ category_index: dict = None,
+ box_thresh: float = 0.1,
+ mask_thresh: float = 0.5,
+ line_thickness: int = 8,
+ font: str = 'arial.ttf',
+ font_size: int = 24,
+ draw_boxes_on_image: bool = True,
+ draw_masks_on_image: bool = False):
+ """
+    Draw bounding boxes, class labels and masks onto an image
+    Args:
+        image: image to draw on
+        boxes: bounding box information
+        classes: class information
+        scores: object probability scores
+        masks: mask information
+        category_index: dict mapping class ids to class names
+        box_thresh: probability threshold used for filtering
+        mask_thresh:
+        line_thickness: bounding box line width
+        font: font type
+        font_size: font size
+        draw_boxes_on_image:
+        draw_masks_on_image:
+
+    Returns:
+
+    """
+
+    # filter out low-probability objects
+ idxs = np.greater(scores, box_thresh)
+ boxes = boxes[idxs]
+ classes = classes[idxs]
+ scores = scores[idxs]
+ if masks is not None:
+ masks = masks[idxs]
+ if len(boxes) == 0:
+ return image
+ colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]
-def draw_box(image, boxes, classes, scores, category_index, thresh=0.5, line_thickness=8):
- box_to_display_str_map = collections.defaultdict(list)
- box_to_color_map = collections.defaultdict(str)
+ if draw_boxes_on_image:
+ # Draw all boxes onto image.
+ draw = ImageDraw.Draw(image)
+ for box, cls, score, color in zip(boxes, classes, scores, colors):
+ left, top, right, bottom = box
+            # draw the object bounding box
+ draw.line([(left, top), (left, bottom), (right, bottom),
+ (right, top), (left, top)], width=line_thickness, fill=color)
+            # draw class and probability information
+ draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)
- filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)
+ if draw_masks_on_image and (masks is not None):
+ # Draw all mask onto image.
+ image = draw_masks(image, masks, colors, mask_thresh)
- # Draw all boxes onto image.
- draw = ImageDraw.Draw(image)
- im_width, im_height = image.size
- for box, color in box_to_color_map.items():
- xmin, ymin, xmax, ymax = box
- (left, right, top, bottom) = (xmin * 1, xmax * 1,
- ymin * 1, ymax * 1)
- draw.line([(left, top), (left, bottom), (right, bottom),
- (right, top), (left, top)], width=line_thickness, fill=color)
- draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
+ return image
diff --git a/pytorch_object_detection/train_coco_dataset/my_dataset.py b/pytorch_object_detection/train_coco_dataset/my_dataset.py
index dbacb54d9..31b71cee9 100644
--- a/pytorch_object_detection/train_coco_dataset/my_dataset.py
+++ b/pytorch_object_detection/train_coco_dataset/my_dataset.py
@@ -64,30 +64,24 @@ def __init__(self, root, dataset="train", transforms=None):
self.transforms = transforms
self.coco = COCO(self.anno_path)
- if dataset == "train":
- # 获取coco数据索引与类别名称的关系
- # 注意在object80中的索引并不是连续的,虽然只有80个类别,但索引还是按照stuff91来排序的
- coco_classes = dict([(v["id"], v["name"]) for k, v in self.coco.cats.items()])
-
- # 将stuff91的类别索引重新编排,从1到80
- coco91to80 = dict([(str(k), idx+1) for idx, (k, _) in enumerate(coco_classes.items())])
- json_str = json.dumps(coco91to80, indent=4)
- with open('coco91_to_80.json', 'w') as json_file:
- json_file.write(json_str)
-
- # 记录重新编排后的索引以及类别名称关系
- coco80_info = dict([(str(idx+1), v) for idx, (_, v) in enumerate(coco_classes.items())])
- json_str = json.dumps(coco80_info, indent=4)
- with open('coco80_indices.json', 'w') as json_file:
- json_file.write(json_str)
- else:
- # 如果是验证集就直接读取生成好的数据
- coco91to80_path = 'coco91_to_80.json'
- assert os.path.exists(coco91to80_path), "file '{}' does not exist.".format(coco91to80_path)
+        # get the mapping between coco category ids and class names
+        # note that the indices in object80 are not contiguous: although there are only 80 classes, the ids still follow the stuff91 numbering
+        data_classes = dict([(v["id"], v["name"]) for k, v in self.coco.cats.items()])
+        max_index = max(data_classes.keys())  # 90
+        # set the missing category names to N/A
+ coco_classes = {}
+ for k in range(1, max_index + 1):
+ if k in data_classes:
+ coco_classes[k] = data_classes[k]
+ else:
+ coco_classes[k] = "N/A"
- coco91to80 = json.load(open(coco91to80_path, "r"))
+ if dataset == "train":
+ json_str = json.dumps(coco_classes, indent=4)
+ with open("coco91_indices.json", "w") as f:
+ f.write(json_str)
- self.coco91to80 = coco91to80
+ self.coco_classes = coco_classes
ids = list(sorted(self.coco.imgs.keys()))
if dataset == "train":
@@ -102,34 +96,40 @@ def parse_targets(self,
coco_targets: list,
w: int = None,
h: int = None):
+ assert w > 0
+ assert h > 0
+
# 只筛选出单个对象的情况
anno = [obj for obj in coco_targets if obj['iscrowd'] == 0]
- # 进一步检查数据,有的标注信息中可能有w或h为0的情况,这样的数据会导致计算回归loss为nan
- boxes = []
- for obj in anno:
- if obj["bbox"][2] > 0 and obj["bbox"][3] > 0:
- boxes.append(obj["bbox"])
+ boxes = [obj["bbox"] for obj in anno]
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
# [xmin, ymin, w, h] -> [xmin, ymin, xmax, ymax]
boxes[:, 2:] += boxes[:, :2]
- if (w is not None) and (h is not None):
- boxes[:, 0::2].clamp_(min=0, max=w)
- boxes[:, 1::2].clamp_(min=0, max=h)
+ boxes[:, 0::2].clamp_(min=0, max=w)
+ boxes[:, 1::2].clamp_(min=0, max=h)
- classes = [self.coco91to80[str(obj["category_id"])] for obj in anno]
+ classes = [obj["category_id"] for obj in anno]
classes = torch.tensor(classes, dtype=torch.int64)
+ area = torch.tensor([obj["area"] for obj in anno])
+ iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
+
+        # keep only valid objects, i.e. those with x_max > x_min and y_max > y_min
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+ boxes = boxes[keep]
+ classes = classes[keep]
+ area = area[keep]
+ iscrowd = iscrowd[keep]
+
target = {}
target["boxes"] = boxes
target["labels"] = classes
target["image_id"] = torch.tensor([img_id])
# for conversion to coco api
- area = torch.tensor([obj["area"] for obj in anno])
- iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
target["area"] = area
target["iscrowd"] = iscrowd
diff --git a/pytorch_object_detection/train_coco_dataset/network_files/boxes.py b/pytorch_object_detection/train_coco_dataset/network_files/boxes.py
index f720df1f8..8eeca4573 100644
--- a/pytorch_object_detection/train_coco_dataset/network_files/boxes.py
+++ b/pytorch_object_detection/train_coco_dataset/network_files/boxes.py
@@ -23,7 +23,7 @@ def nms(boxes, scores, iou_threshold):
scores for each one of the boxes
iou_threshold : float
discards all overlapping
- boxes with IoU < iou_threshold
+ boxes with IoU > iou_threshold
Returns
-------
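
The corrected docstring now matches the actual NMS semantics: boxes whose IoU with an already-kept, higher-scoring box *exceeds* `iou_threshold` are discarded, not kept. A quick illustration with torchvision's operator:

```python
import torch
from torchvision.ops import nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],     # IoU ~0.68 with the first box
                      [20., 20., 30., 30.]])
scores = torch.tensor([0.9, 0.8, 0.7])

# With iou_threshold=0.5 the second box is suppressed by the first (0.68 > 0.5).
print(nms(boxes, scores, iou_threshold=0.5))   # tensor([0, 2])
# With iou_threshold=0.7 its overlap no longer exceeds the threshold, so it survives.
print(nms(boxes, scores, iou_threshold=0.7))   # tensor([0, 1, 2])
```
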
diff --git a/pytorch_object_detection/train_coco_dataset/network_files/faster_rcnn_framework.py b/pytorch_object_detection/train_coco_dataset/network_files/faster_rcnn_framework.py
index 20f9bccbd..d658b0113 100644
--- a/pytorch_object_detection/train_coco_dataset/network_files/faster_rcnn_framework.py
+++ b/pytorch_object_detection/train_coco_dataset/network_files/faster_rcnn_framework.py
@@ -245,7 +245,7 @@ class FasterRCNN(FasterRCNNBase):
def __init__(self, backbone, num_classes=None,
# transform parameter
- min_size=800, max_size=1000, # 预处理resize时限制的最小尺寸与最大尺寸
+ min_size=800, max_size=1333, # 预处理resize时限制的最小尺寸与最大尺寸
image_mean=None, image_std=None, # 预处理normalize时使用的均值和方差
# RPN parameters
rpn_anchor_generator=None, rpn_head=None,
diff --git a/pytorch_object_detection/train_coco_dataset/predict.py b/pytorch_object_detection/train_coco_dataset/predict.py
index b74831cdf..2dc508d7e 100644
--- a/pytorch_object_detection/train_coco_dataset/predict.py
+++ b/pytorch_object_detection/train_coco_dataset/predict.py
@@ -6,20 +6,18 @@
import torchvision
from PIL import Image
import matplotlib.pyplot as plt
-
from torchvision import transforms
+from torchvision.models.feature_extraction import create_feature_extractor
+
from network_files import FasterRCNN, AnchorsGenerator
-from backbone import vgg, MobileNetV2
-from draw_box_utils import draw_box
+from backbone import vgg, MobileNetV2, resnet50
+from draw_box_utils import draw_objs
def create_model(num_classes):
- vgg_feature = vgg(model_name="vgg16").features
- backbone = torch.nn.Sequential(*list(vgg_feature._modules.values())[:-1]) # 删除feature中最后的maxpool层
- backbone.out_channels = 512
-
- # backbone = MobileNetV2().features
- # backbone.out_channels = 1280 # 设置对应backbone输出特征矩阵的channels
+ res50 = resnet50()
+ backbone = create_feature_extractor(res50, return_nodes={"layer3": "0"})
+ backbone.out_channels = 1024
anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
aspect_ratios=((0.5, 1.0, 2.0),))
@@ -47,21 +45,22 @@ def main():
print("using {} device.".format(device))
# create model
- num_classes = 80
- model = create_model(num_classes=num_classes+1)
+    num_classes = 90  # number of classes, excluding background
+ model = create_model(num_classes=num_classes + 1)
# load train weights
- train_weights = "./save_weights/model_25.pth"
- assert os.path.exists(train_weights), "{} file dose not exist.".format(train_weights)
- model.load_state_dict(torch.load(train_weights, map_location=device)["model"])
+ weights_path = "./save_weights/model_25.pth"
+    assert os.path.exists(weights_path), "{} file does not exist.".format(weights_path)
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
model.to(device)
# read class_indict
- label_json_path = './coco80_indices.json'
+ label_json_path = './coco91_indices.json'
assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
- json_file = open(label_json_path, 'r')
- category_index = json.load(json_file)
- json_file.close()
+ with open(label_json_path, 'r') as f:
+ category_index = json.load(f)
# load image
original_img = Image.open("./test.jpg")
@@ -91,17 +90,19 @@ def main():
if len(predict_boxes) == 0:
print("没有检测到任何目标!")
- draw_box(original_img,
- predict_boxes,
- predict_classes,
- predict_scores,
- category_index,
- thresh=0.5,
- line_thickness=3)
- plt.imshow(original_img)
+ plot_img = draw_objs(original_img,
+ predict_boxes,
+ predict_classes,
+ predict_scores,
+ category_index=category_index,
+ box_thresh=0.5,
+ line_thickness=3,
+ font='arial.ttf',
+ font_size=20)
+ plt.imshow(plot_img)
plt.show()
# 保存预测的图片结果
- original_img.save("test_result.jpg")
+ plot_img.save("test_result.jpg")
if __name__ == '__main__':
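
`predict.py` now builds the backbone with `create_feature_extractor` instead of slicing an `nn.Sequential`. A self-contained sketch using torchvision's own `resnet50` (the repo's `backbone.resnet50` is assumed to expose the same node names):

```python
import torch
from torchvision.models import resnet50
from torchvision.models.feature_extraction import create_feature_extractor

res50 = resnet50()
# expose only the layer3 output (stride 16, 1024 channels) under the key "0",
# which is the feature-map name the RPN and RoI heads expect
backbone = create_feature_extractor(res50, return_nodes={"layer3": "0"})
backbone.out_channels = 1024

with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 800, 800))
print(feats["0"].shape)  # torch.Size([1, 1024, 50, 50])
```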
diff --git a/pytorch_object_detection/train_coco_dataset/requirements.txt b/pytorch_object_detection/train_coco_dataset/requirements.txt
index 2d0345811..3a50c7ad2 100644
--- a/pytorch_object_detection/train_coco_dataset/requirements.txt
+++ b/pytorch_object_detection/train_coco_dataset/requirements.txt
@@ -4,5 +4,5 @@ numpy
tqdm
pycocotools
Pillow
-torch==1.7.1
-torchvision==0.8.2
+torch==1.10
+torchvision==0.11.1
diff --git a/pytorch_object_detection/train_coco_dataset/results20210412-092355.txt b/pytorch_object_detection/train_coco_dataset/results20210412-092355.txt
deleted file mode 100644
index c0a250e10..000000000
--- a/pytorch_object_detection/train_coco_dataset/results20210412-092355.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-epoch:0 0.0413 0.109 0.0222 0.0107 0.0495 0.062 0.0567 0.0896 0.0919 0.0169 0.0882 0.1438 1.6263 0.01
-epoch:1 0.0986 0.2188 0.0751 0.0359 0.1154 0.1411 0.111 0.167 0.1703 0.0504 0.1853 0.2523 1.0788 0.01
-epoch:2 0.1258 0.2702 0.1021 0.0473 0.1483 0.1761 0.1359 0.2075 0.2132 0.0694 0.2369 0.3164 1.0252 0.01
-epoch:3 0.1369 0.2809 0.1176 0.0543 0.1551 0.1961 0.1433 0.2086 0.2133 0.0778 0.2312 0.3145 0.992 0.01
-epoch:4 0.1553 0.3137 0.1371 0.0635 0.1774 0.2202 0.1592 0.2367 0.2422 0.0918 0.2677 0.3499 0.9698 0.01
-epoch:5 0.1626 0.3245 0.1439 0.0663 0.1869 0.2251 0.1663 0.2442 0.2501 0.0958 0.2726 0.3581 0.9524 0.01
-epoch:6 0.1739 0.3442 0.1539 0.0752 0.2007 0.2348 0.1742 0.2617 0.2689 0.1091 0.3073 0.3663 0.9372 0.01
-epoch:7 0.1762 0.3417 0.1609 0.0756 0.1963 0.2461 0.1728 0.2529 0.2583 0.1034 0.2799 0.3733 0.9238 0.01
-epoch:8 0.1844 0.3551 0.1709 0.0792 0.2107 0.2535 0.1796 0.2716 0.2785 0.1134 0.3089 0.3964 0.9136 0.01
-epoch:9 0.1909 0.3631 0.1811 0.0857 0.2172 0.2603 0.1837 0.2731 0.2797 0.1231 0.3095 0.3914 0.9045 0.01
-epoch:10 0.1955 0.3684 0.1894 0.0858 0.2242 0.2667 0.1873 0.2756 0.282 0.1173 0.3138 0.3901 0.896 0.01
-epoch:11 0.1995 0.373 0.1932 0.0856 0.2234 0.2732 0.1889 0.2804 0.2874 0.1217 0.3172 0.4012 0.8883 0.01
-epoch:12 0.2067 0.38 0.2019 0.0884 0.2332 0.281 0.1962 0.2891 0.2959 0.1223 0.3251 0.4223 0.881 0.01
-epoch:13 0.2109 0.3912 0.2052 0.0935 0.2408 0.2837 0.2016 0.3053 0.3138 0.1343 0.3526 0.4342 0.8743 0.01
-epoch:14 0.2131 0.3901 0.206 0.0933 0.241 0.2885 0.2023 0.301 0.3086 0.13 0.3429 0.4311 0.8676 0.01
-epoch:15 0.216 0.3968 0.2136 0.0967 0.2432 0.2984 0.2028 0.2999 0.307 0.1338 0.3364 0.4383 0.8631 0.01
-epoch:16 0.2286 0.4099 0.2293 0.1024 0.2594 0.3138 0.2118 0.315 0.3228 0.1409 0.3608 0.4518 0.8302 0.001
-epoch:17 0.2296 0.4102 0.2283 0.1027 0.2577 0.3116 0.2131 0.316 0.3239 0.1422 0.3564 0.4546 0.8255 0.001
-epoch:18 0.2306 0.4125 0.2308 0.1031 0.2592 0.3155 0.213 0.3161 0.3242 0.1416 0.3585 0.4566 0.8239 0.001
-epoch:19 0.2324 0.4163 0.2327 0.1042 0.2618 0.3163 0.2146 0.3193 0.3273 0.1483 0.3635 0.4543 0.8221 0.001
-epoch:20 0.2306 0.4129 0.2293 0.103 0.2611 0.3147 0.2121 0.3143 0.3219 0.1394 0.359 0.4506 0.8216 0.001
-epoch:21 0.2325 0.4147 0.2338 0.1052 0.2623 0.3167 0.2157 0.3185 0.3263 0.1458 0.3625 0.4546 0.8208 0.001
-epoch:22 0.2321 0.4145 0.2313 0.1034 0.261 0.3226 0.2145 0.3181 0.3261 0.1428 0.3616 0.462 0.8159 0.0001
-epoch:23 0.232 0.4143 0.2305 0.1036 0.2613 0.3198 0.2139 0.3181 0.3261 0.1433 0.3617 0.4559 0.8162 0.0001
-epoch:24 0.2315 0.4136 0.2302 0.1032 0.2603 0.3209 0.2131 0.317 0.3249 0.1422 0.3598 0.4594 0.8161 0.0001
-epoch:25 0.232 0.4145 0.2317 0.1035 0.2614 0.3219 0.215 0.3183 0.3262 0.1444 0.3605 0.4601 0.8158 0.0001
diff --git a/pytorch_object_detection/train_coco_dataset/results20220408-201436.txt b/pytorch_object_detection/train_coco_dataset/results20220408-201436.txt
new file mode 100644
index 000000000..0927e308c
--- /dev/null
+++ b/pytorch_object_detection/train_coco_dataset/results20220408-201436.txt
@@ -0,0 +1,26 @@
+epoch:0 0.0504 0.1144 0.0362 0.0207 0.0601 0.0657 0.0702 0.1069 0.1087 0.0335 0.1153 0.1486 1.7430 0.005000
+epoch:1 0.1138 0.2300 0.0994 0.0494 0.1279 0.1554 0.1303 0.1940 0.1980 0.0747 0.2051 0.2831 1.2282 0.005000
+epoch:2 0.1461 0.2773 0.1394 0.0636 0.1635 0.1997 0.1530 0.2243 0.2288 0.0938 0.2435 0.3309 1.1391 0.005000
+epoch:3 0.1669 0.3134 0.1642 0.0750 0.1843 0.2282 0.1680 0.2509 0.2561 0.1091 0.2705 0.3701 1.0902 0.005000
+epoch:4 0.1857 0.3389 0.1828 0.0829 0.2074 0.2568 0.1830 0.2708 0.2756 0.1140 0.2937 0.3998 1.0581 0.005000
+epoch:5 0.1908 0.3431 0.1930 0.0901 0.2128 0.2578 0.1839 0.2704 0.2753 0.1197 0.2927 0.3893 1.0337 0.005000
+epoch:6 0.2044 0.3634 0.2077 0.0954 0.2247 0.2796 0.1947 0.2893 0.2956 0.1317 0.3138 0.4178 1.0127 0.005000
+epoch:7 0.2068 0.3651 0.2099 0.0953 0.2269 0.2840 0.1959 0.2869 0.2926 0.1290 0.3093 0.4186 0.9945 0.005000
+epoch:8 0.2171 0.3788 0.2218 0.0996 0.2470 0.2969 0.2012 0.3001 0.3071 0.1329 0.3375 0.4371 0.9806 0.005000
+epoch:9 0.2146 0.3717 0.2207 0.0946 0.2315 0.3038 0.2011 0.2910 0.2962 0.1277 0.3091 0.4321 0.9691 0.005000
+epoch:10 0.2280 0.3974 0.2345 0.1035 0.2535 0.3108 0.2118 0.3119 0.3182 0.1402 0.3429 0.4537 0.9567 0.005000
+epoch:11 0.2332 0.3983 0.2443 0.1111 0.2534 0.3149 0.2136 0.3128 0.3190 0.1515 0.3417 0.4438 0.9450 0.005000
+epoch:12 0.2400 0.4094 0.2486 0.1102 0.2622 0.3251 0.2175 0.3214 0.3289 0.1507 0.3521 0.4588 0.9369 0.005000
+epoch:13 0.2449 0.4152 0.2563 0.1121 0.2741 0.3308 0.2234 0.3286 0.3363 0.1552 0.3703 0.4627 0.9286 0.005000
+epoch:14 0.2466 0.4192 0.2542 0.1131 0.2765 0.3412 0.2220 0.3258 0.3322 0.1481 0.3627 0.4776 0.9203 0.005000
+epoch:15 0.2492 0.4216 0.2569 0.1147 0.2781 0.3417 0.2254 0.3337 0.3402 0.1565 0.3666 0.4893 0.9116 0.005000
+epoch:16 0.2689 0.4433 0.2814 0.1246 0.2963 0.3705 0.2384 0.3495 0.3569 0.1671 0.3864 0.5046 0.8616 0.000500
+epoch:17 0.2719 0.4473 0.2865 0.1243 0.3021 0.3743 0.2399 0.3519 0.3593 0.1669 0.3931 0.5017 0.8515 0.000500
+epoch:18 0.2738 0.4521 0.2857 0.1256 0.3048 0.3718 0.2416 0.3564 0.3645 0.1713 0.3996 0.5037 0.8472 0.000500
+epoch:19 0.2759 0.4534 0.2893 0.1259 0.3094 0.3719 0.2448 0.3603 0.3681 0.1691 0.4073 0.5055 0.8439 0.000500
+epoch:20 0.2720 0.4483 0.2838 0.1250 0.3021 0.3681 0.2400 0.3532 0.3613 0.1688 0.3944 0.4994 0.8417 0.000500
+epoch:21 0.2748 0.4501 0.2904 0.1241 0.3019 0.3759 0.2421 0.3561 0.3641 0.1682 0.3941 0.5101 0.8378 0.000500
+epoch:22 0.2754 0.4532 0.2896 0.1281 0.3064 0.3759 0.2419 0.3586 0.3660 0.1712 0.3993 0.5115 0.8304 0.000050
+epoch:23 0.2757 0.4516 0.2907 0.1271 0.3068 0.3748 0.2423 0.3572 0.3650 0.1692 0.4005 0.5087 0.8307 0.000050
+epoch:24 0.2750 0.4500 0.2888 0.1256 0.3017 0.3760 0.2411 0.3536 0.3611 0.1669 0.3894 0.5040 0.8299 0.000050
+epoch:25 0.2769 0.4537 0.2903 0.1263 0.3082 0.3782 0.2424 0.3582 0.3663 0.1693 0.4020 0.5116 0.8281 0.000050
diff --git a/pytorch_object_detection/train_coco_dataset/train.py b/pytorch_object_detection/train_coco_dataset/train.py
index b02200d39..4b068a3ec 100644
--- a/pytorch_object_detection/train_coco_dataset/train.py
+++ b/pytorch_object_detection/train_coco_dataset/train.py
@@ -6,20 +6,30 @@
import transforms
from network_files import FasterRCNN, AnchorsGenerator
-from backbone import MobileNetV2, vgg
+from backbone import MobileNetV2, vgg, resnet50
from my_dataset import CocoDetection
from train_utils import train_eval_utils as utils
+from train_utils import GroupedBatchSampler, create_aspect_ratio_groups
+from torchvision.models.feature_extraction import create_feature_extractor
def create_model(num_classes):
- # https://download.pytorch.org/models/vgg16-397923af.pth
- # 如果使用mobilenetv2的话就下载对应预训练权重并注释下面三行,接着把mobilenetv2模型对应的两行代码注释取消掉
- vgg_feature = vgg(model_name="vgg16", weights_path="./backbone/vgg16.pth").features
- backbone = torch.nn.Sequential(*list(vgg_feature._modules.values())[:-1]) # 删除feature中最后的maxpool层
- backbone.out_channels = 512
-
- # https://download.pytorch.org/models/mobilenet_v2-b0353104.pth
- # backbone = MobileNetV2(weights_path="./backbone/mobilenet_v2.pth").features
+    # vgg16 as the backbone
+    # pretrained weights: https://download.pytorch.org/models/vgg16-397923af.pth
+    # vgg16 = vgg(model_name="vgg16", weights_path="./vgg16.pth")
+    # backbone = create_feature_extractor(vgg16, return_nodes={"features.29": "0"})  # drop the final maxpool layer in features
+    # backbone.out_channels = 512
+
+    # resnet50 as the backbone
+    # pretrained weights: https://download.pytorch.org/models/resnet50-19c8e357.pth
+    res50 = resnet50()
+    res50.load_state_dict(torch.load("./resnet50.pth", map_location="cpu"))
+    backbone = create_feature_extractor(res50, return_nodes={"layer3": "0"})
+    backbone.out_channels = 1024
+
+    # mobilenetv2 as the backbone
+    # pretrained weights: https://download.pytorch.org/models/mobilenet_v2-b0353104.pth
+    # backbone = MobileNetV2(weights_path="./mobilenet_v2.pth").features
# backbone.out_channels = 1280 # 设置对应backbone输出特征矩阵的channels
anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
@@ -54,29 +64,49 @@ def main(args):
# load train data set
# coco2017 -> annotations -> instances_train2017.json
- train_data_set = CocoDetection(COCO_root, "train", data_transform["train"])
+ train_dataset = CocoDetection(COCO_root, "train", data_transform["train"])
+ train_sampler = None
+
+    # whether to sample images with similar aspect ratios into the same batch
+    # enabled by default; it reduces the GPU memory needed during training
+    if args.aspect_ratio_group_factor >= 0:
+        train_sampler = torch.utils.data.RandomSampler(train_dataset)
+        # compute, for every image, the index of the aspect-ratio bin it falls into
+        group_ids = create_aspect_ratio_groups(train_dataset, k=args.aspect_ratio_group_factor)
+        # every batch then draws its images from a single aspect-ratio bin
+ train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
+
# 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch
batch_size = args.batch_size
nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
print('Using %g dataloader workers' % nw)
- train_data_loader = torch.utils.data.DataLoader(train_data_set,
- batch_size=batch_size,
- shuffle=True,
- pin_memory=True,
- num_workers=nw,
- collate_fn=train_data_set.collate_fn)
+
+ if train_sampler:
+        # when sampling by aspect ratio, the dataloader must be given a batch_sampler instead of batch_size
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_sampler=train_batch_sampler,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+ else:
+ train_data_loader = torch.utils.data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
# load validation data set
# coco2017 -> annotations -> instances_val2017.json
- val_data_set = CocoDetection(COCO_root, "val", data_transform["val"])
- val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
- batch_size=batch_size,
- shuffle=False,
- pin_memory=True,
- num_workers=nw,
- collate_fn=train_data_set.collate_fn)
-
- # create model num_classes equal background + 80 classes
+ val_dataset = CocoDetection(COCO_root, "val", data_transform["val"])
+ val_data_loader = torch.utils.data.DataLoader(val_dataset,
+ batch_size=1,
+ shuffle=False,
+ pin_memory=True,
+ num_workers=nw,
+ collate_fn=train_dataset.collate_fn)
+
+    # create the model; num_classes equals background + object classes
model = create_model(num_classes=args.num_classes + 1)
# print(model)
@@ -123,12 +153,12 @@ def main(args):
lr_scheduler.step()
# evaluate on the test dataset
- coco_info = utils.evaluate(model, val_data_set_loader, device=device)
+ coco_info = utils.evaluate(model, val_data_loader, device=device)
# write into txt
with open(results_file, "a") as f:
# 写入的数据包括coco指标还有loss和learning rate
- result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
@@ -166,7 +196,7 @@ def main(args):
# 训练数据集的根目录
parser.add_argument('--data-path', default='/data/coco2017', help='dataset')
# 检测目标类别数(不包含背景)
- parser.add_argument('--num-classes', default=80, type=int, help='num_classes')
+ parser.add_argument('--num-classes', default=90, type=int, help='num_classes')
# 文件保存地址
parser.add_argument('--output-dir', default='./save_weights', help='path where to save')
# 若需要接着上次训练,则指定上次训练保存权重文件地址
@@ -177,7 +207,7 @@ def main(args):
parser.add_argument('--epochs', default=26, type=int, metavar='N',
help='number of total epochs to run')
# 学习率
- parser.add_argument('--lr', default=0.002, type=float,
+ parser.add_argument('--lr', default=0.005, type=float,
help='initial learning rate, 0.02 is the default value for training '
'on 8 gpus and 2 images_per_gpu')
# SGD的momentum参数
@@ -193,8 +223,9 @@ def main(args):
# 针对torch.optim.lr_scheduler.MultiStepLR的参数
parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
# 训练的batch size(如果内存/GPU显存充裕,建议设置更大)
- parser.add_argument('--batch_size', default=2, type=int, metavar='N',
+ parser.add_argument('--batch_size', default=4, type=int, metavar='N',
help='batch size when training.')
+ parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
# 是否使用混合精度训练(需要GPU支持混合精度)
parser.add_argument("--amp", default=False, help="Use torch.cuda.amp for mixed precision training")
diff --git a/pytorch_object_detection/train_coco_dataset/train_multi_GPU.py b/pytorch_object_detection/train_coco_dataset/train_multi_GPU.py
index ccc9cd364..5ee8e5303 100644
--- a/pytorch_object_detection/train_coco_dataset/train_multi_GPU.py
+++ b/pytorch_object_detection/train_coco_dataset/train_multi_GPU.py
@@ -7,22 +7,20 @@
import transforms
from my_dataset import CocoDetection
-from backbone import vgg
+from backbone import resnet50
from network_files import FasterRCNN, AnchorsGenerator
import train_utils.train_eval_utils as utils
from train_utils import GroupedBatchSampler, create_aspect_ratio_groups, init_distributed_mode, save_on_master, mkdir
+from torchvision.models.feature_extraction import create_feature_extractor
def create_model(num_classes):
- # https://download.pytorch.org/models/vgg16-397923af.pth
- # 如果使用mobilenetv2的话就下载对应预训练权重并注释下面三行,接着把mobilenetv2模型对应的两行代码注释取消掉
- vgg_feature = vgg(model_name="vgg16", weights_path="./backbone/vgg16.pth").features
- backbone = torch.nn.Sequential(*list(vgg_feature._modules.values())[:-1]) # 删除feature中最后的maxpool层
- backbone.out_channels = 512
-
- # https://download.pytorch.org/models/mobilenet_v2-b0353104.pth
- # backbone = MobileNetV2(weights_path="./backbone/mobilenet_v2.pth").features
- # backbone.out_channels = 1280 # 设置对应backbone输出特征矩阵的channels
+    # resnet50 as the backbone
+    # pretrained weights: https://download.pytorch.org/models/resnet50-19c8e357.pth
+ res50 = resnet50()
+ res50.load_state_dict(torch.load("./resnet50.pth", map_location="cpu"))
+ backbone = create_feature_extractor(res50, return_nodes={"layer3": "0"})
+ backbone.out_channels = 1024
anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
aspect_ratios=((0.5, 1.0, 2.0),))
@@ -61,42 +59,45 @@ def main(args):
# load train data set
# coco2017 -> annotations -> instances_train2017.json
- train_data_set = CocoDetection(COCO_root, "train", data_transform["train"])
+ train_dataset = CocoDetection(COCO_root, "train", data_transform["train"])
# load validation data set
# coco2017 -> annotations -> instances_val2017.json
- val_data_set = CocoDetection(COCO_root, "val", data_transform["val"])
+ val_dataset = CocoDetection(COCO_root, "val", data_transform["val"])
print("Creating data loaders")
if args.distributed:
- train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set)
- test_sampler = torch.utils.data.distributed.DistributedSampler(val_data_set)
+ train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+ test_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
else:
- train_sampler = torch.utils.data.RandomSampler(train_data_set)
- test_sampler = torch.utils.data.SequentialSampler(val_data_set)
+ train_sampler = torch.utils.data.RandomSampler(train_dataset)
+ test_sampler = torch.utils.data.SequentialSampler(val_dataset)
if args.aspect_ratio_group_factor >= 0:
# 统计所有图像比例在bins区间中的位置索引
- group_ids = create_aspect_ratio_groups(train_data_set, k=args.aspect_ratio_group_factor)
+ group_ids = create_aspect_ratio_groups(train_dataset, k=args.aspect_ratio_group_factor)
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
else:
train_batch_sampler = torch.utils.data.BatchSampler(
train_sampler, args.batch_size, drop_last=True)
data_loader = torch.utils.data.DataLoader(
- train_data_set, batch_sampler=train_batch_sampler, num_workers=args.workers,
- collate_fn=train_data_set.collate_fn)
+ train_dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
+ collate_fn=train_dataset.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
- val_data_set, batch_size=1,
+ val_dataset, batch_size=1,
sampler=test_sampler, num_workers=args.workers,
- collate_fn=train_data_set.collate_fn)
+ collate_fn=train_dataset.collate_fn)
print("Creating model")
- # create model num_classes equal background + 80 classes
+    # create the model; num_classes equals background + object classes
model = create_model(num_classes=args.num_classes + 1)
model.to(device)
+ if args.distributed and args.sync_bn:
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
model_without_ddp = model
if args.distributed:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
@@ -152,7 +153,7 @@ def main(args):
# write into txt
with open(results_file, "a") as f:
# 写入的数据包括coco指标还有loss和learning rate
- result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
+ result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
@@ -195,9 +196,9 @@ def main(args):
# 训练设备类型
parser.add_argument('--device', default='cuda', help='device')
# 检测目标类别数(不包含背景)
- parser.add_argument('--num-classes', default=80, type=int, help='num_classes')
+ parser.add_argument('--num-classes', default=90, type=int, help='num_classes')
# 每块GPU上的batch_size
- parser.add_argument('-b', '--batch-size', default=16, type=int,
+ parser.add_argument('-b', '--batch-size', default=4, type=int,
help='images per gpu, the total batch size is $NGPU x batch_size')
# 指定接着从哪个epoch数开始训练
parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
@@ -237,6 +238,7 @@ def main(args):
parser.add_argument('--world-size', default=4, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
+ parser.add_argument("--sync-bn", dest="sync_bn", help="Use sync batch norm", type=bool, default=False)
# 是否使用混合精度训练(需要GPU支持混合精度)
parser.add_argument("--amp", default=False, help="Use torch.cuda.amp for mixed precision training")
diff --git a/pytorch_object_detection/train_coco_dataset/train_utils/__init__.py b/pytorch_object_detection/train_coco_dataset/train_utils/__init__.py
index 78167b64d..ce519bc94 100644
--- a/pytorch_object_detection/train_coco_dataset/train_utils/__init__.py
+++ b/pytorch_object_detection/train_coco_dataset/train_utils/__init__.py
@@ -1,2 +1,3 @@
from .group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from .distributed_utils import init_distributed_mode, save_on_master, mkdir
+from .coco_eval import EvalCOCOMetric
diff --git a/pytorch_object_detection/train_coco_dataset/train_utils/coco_eval.py b/pytorch_object_detection/train_coco_dataset/train_utils/coco_eval.py
new file mode 100644
index 000000000..b8df0204d
--- /dev/null
+++ b/pytorch_object_detection/train_coco_dataset/train_utils/coco_eval.py
@@ -0,0 +1,163 @@
+import json
+import copy
+
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import pycocotools.mask as mask_util
+from .distributed_utils import all_gather, is_main_process
+
+
+def merge(img_ids, eval_results):
+    """Gather data from all processes."""
+ all_img_ids = all_gather(img_ids)
+ all_eval_results = all_gather(eval_results)
+
+ merged_img_ids = []
+ for p in all_img_ids:
+ merged_img_ids.extend(p)
+
+ merged_eval_results = []
+ for p in all_eval_results:
+ merged_eval_results.extend(p)
+
+ merged_img_ids = np.array(merged_img_ids)
+
+ # keep only unique (and in sorted order) images
+    # drop duplicated image ids: to keep the number of images per process equal, multi-GPU training may assign the same image to more than one process
+ merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
+ merged_eval_results = [merged_eval_results[i] for i in idx]
+
+ return list(merged_img_ids), merged_eval_results
+
+
+class EvalCOCOMetric:
+ def __init__(self,
+ coco: COCO = None,
+ iou_type: str = None,
+ results_file_name: str = "predict_results.json",
+ classes_mapping: dict = None):
+ self.coco = copy.deepcopy(coco)
+        self.img_ids = []  # ids of the images handled by this process
+ self.results = []
+ self.aggregation_results = None
+ self.classes_mapping = classes_mapping
+ self.coco_evaluator = None
+ assert iou_type in ["bbox", "segm", "keypoints"]
+ self.iou_type = iou_type
+ self.results_file_name = results_file_name
+
+ def prepare_for_coco_detection(self, targets, outputs):
+        """Convert predictions into the format expected by COCOeval, for object detection."""
+        # iterate over the predictions of every image
+ for target, output in zip(targets, outputs):
+ if len(output) == 0:
+ continue
+
+ img_id = int(target["image_id"])
+ if img_id in self.img_ids:
+                # skip duplicated images
+ continue
+ self.img_ids.append(img_id)
+ per_image_boxes = output["boxes"]
+            # COCOeval expects every box in [x_min, y_min, w, h] format,
+            # while the predicted boxes are [x_min, y_min, x_max, y_max], so convert them
+ per_image_boxes[:, 2:] -= per_image_boxes[:, :2]
+ per_image_classes = output["labels"].tolist()
+ per_image_scores = output["scores"].tolist()
+
+ res_list = []
+            # iterate over every detected object
+ for object_score, object_class, object_box in zip(
+ per_image_scores, per_image_classes, per_image_boxes):
+ object_score = float(object_score)
+ class_idx = int(object_class)
+ if self.classes_mapping is not None:
+ class_idx = int(self.classes_mapping[str(class_idx)])
+ # We recommend rounding coordinates to the nearest tenth of a pixel
+ # to reduce resulting JSON file size.
+ object_box = [round(b, 2) for b in object_box.tolist()]
+
+ res = {"image_id": img_id,
+ "category_id": class_idx,
+ "bbox": object_box,
+ "score": round(object_score, 3)}
+ res_list.append(res)
+ self.results.append(res_list)
+
+ def prepare_for_coco_segmentation(self, targets, outputs):
+        """Convert predictions into the format expected by COCOeval, for instance segmentation."""
+        # iterate over the predictions of every image
+ for target, output in zip(targets, outputs):
+ if len(output) == 0:
+ continue
+
+ img_id = int(target["image_id"])
+ if img_id in self.img_ids:
+                # skip duplicated images
+ continue
+
+ self.img_ids.append(img_id)
+ per_image_masks = output["masks"]
+ per_image_classes = output["labels"].tolist()
+ per_image_scores = output["scores"].tolist()
+
+ masks = per_image_masks > 0.5
+
+ res_list = []
+            # iterate over every detected object
+ for mask, label, score in zip(masks, per_image_classes, per_image_scores):
+ rle = mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
+ rle["counts"] = rle["counts"].decode("utf-8")
+
+ class_idx = int(label)
+ if self.classes_mapping is not None:
+ class_idx = int(self.classes_mapping[str(class_idx)])
+
+ res = {"image_id": img_id,
+ "category_id": class_idx,
+ "segmentation": rle,
+ "score": round(score, 3)}
+ res_list.append(res)
+ self.results.append(res_list)
+
+ def update(self, targets, outputs):
+ if self.iou_type == "bbox":
+ self.prepare_for_coco_detection(targets, outputs)
+ elif self.iou_type == "segm":
+ self.prepare_for_coco_segmentation(targets, outputs)
+ else:
+ raise KeyError(f"not support iou_type: {self.iou_type}")
+
+ def synchronize_results(self):
+        # synchronize the data gathered by all processes
+ eval_ids, eval_results = merge(self.img_ids, self.results)
+ self.aggregation_results = {"img_ids": eval_ids, "results": eval_results}
+
+        # only the main process needs to save the results
+ if is_main_process():
+ results = []
+ [results.extend(i) for i in eval_results]
+ # write predict results into json file
+ json_str = json.dumps(results, indent=4)
+ with open(self.results_file_name, 'w') as json_file:
+ json_file.write(json_str)
+
+ def evaluate(self):
+        # evaluation only needs to run on the main process
+ if is_main_process():
+ # accumulate predictions from all images
+ coco_true = self.coco
+ coco_pre = coco_true.loadRes(self.results_file_name)
+
+ self.coco_evaluator = COCOeval(cocoGt=coco_true, cocoDt=coco_pre, iouType=self.iou_type)
+
+ self.coco_evaluator.evaluate()
+ self.coco_evaluator.accumulate()
+ print(f"IoU metric: {self.iou_type}")
+ self.coco_evaluator.summarize()
+
+ coco_info = self.coco_evaluator.stats.tolist() # numpy to list
+ return coco_info
+ else:
+ return None
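
Roughly how the new `EvalCOCOMetric` is meant to be driven from an evaluation loop, mirroring its use in `train_eval_utils.py` and `validation.py`; `model`, `device`, `val_dataset` and `val_data_loader` are placeholders assumed to exist:

```python
# hypothetical evaluation loop built around EvalCOCOMetric
import torch
from train_utils import EvalCOCOMetric

det_metric = EvalCOCOMetric(val_dataset.coco, iou_type="bbox",
                            results_file_name="det_results.json")
model.eval()
with torch.no_grad():
    for images, targets in val_data_loader:
        outputs = model([img.to(device) for img in images])
        outputs = [{k: v.to("cpu") for k, v in o.items()} for o in outputs]
        det_metric.update(targets, outputs)   # accumulate COCO-format results

det_metric.synchronize_results()              # gather across ranks, dump det_results.json
stats = det_metric.evaluate()                 # COCOeval stats on the main process, else None
```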
diff --git a/pytorch_object_detection/train_coco_dataset/train_utils/distributed_utils.py b/pytorch_object_detection/train_coco_dataset/train_utils/distributed_utils.py
index 95d0b11e1..80b2412c6 100644
--- a/pytorch_object_detection/train_coco_dataset/train_utils/distributed_utils.py
+++ b/pytorch_object_detection/train_coco_dataset/train_utils/distributed_utils.py
@@ -83,38 +83,8 @@ def all_gather(data):
if world_size == 1:
return [data]
- # serialized to a Tensor
- # 将数据转为tensor
- buffer = pickle.dumps(data)
- storage = torch.ByteStorage.from_buffer(buffer)
- tensor = torch.ByteTensor(storage).to("cuda")
-
- # obtain Tensor size of each rank
- # 获取每个进程中tensor的大小,并求最大值
- local_size = torch.tensor([tensor.numel()], device="cuda")
- size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
- dist.all_gather(size_list, local_size)
- size_list = [int(size.item()) for size in size_list]
- max_size = max(size_list)
-
- # receiving Tensor from all ranks
- # we pad the tensor because torch all_gather does not support
- # gathering tensors of different shapes
- # 由于现在all_gather方法只能传播相同长度的数据,所以需要pad处理
- tensor_list = []
- for _ in size_list:
- tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
- if local_size != max_size:
- padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
- tensor = torch.cat((tensor, padding), dim=0)
- dist.all_gather(tensor_list, tensor)
-
- # 将从各个进程中获取得到的数据整合在一起
- # 注意要将多余的pad给删除掉
- data_list = []
- for size, tensor in zip(size_list, tensor_list):
- buffer = tensor.cpu().numpy().tobytes()[:size]
- data_list.append(pickle.loads(buffer))
+ data_list = [None] * world_size
+ dist.all_gather_object(data_list, data)
return data_list
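
The hand-rolled pickle/pad/`all_gather` sequence is replaced by `torch.distributed.all_gather_object`, which serializes arbitrary picklable objects and handles unequal sizes internally (available since PyTorch 1.8, so covered by the pinned torch 1.10). A hedged sketch of the call pattern in a multi-process job:

```python
import torch.distributed as dist

# assumes dist.init_process_group(...) has already been called by the launcher
def gather_detection_counts(num_dets_this_rank: int):
    """Every rank contributes one Python object; every rank receives them all."""
    world_size = dist.get_world_size()
    payload = {"rank": dist.get_rank(), "num_dets": num_dets_this_rank}
    gathered = [None] * world_size
    dist.all_gather_object(gathered, payload)  # handles pickling and size padding
    return gathered

# with 2 processes, both ranks end up with something like
# [{'rank': 0, 'num_dets': 17}, {'rank': 1, 'num_dets': 23}]
```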
diff --git a/pytorch_object_detection/train_coco_dataset/train_utils/train_eval_utils.py b/pytorch_object_detection/train_coco_dataset/train_utils/train_eval_utils.py
index b3710a208..ba009fa43 100644
--- a/pytorch_object_detection/train_coco_dataset/train_utils/train_eval_utils.py
+++ b/pytorch_object_detection/train_coco_dataset/train_utils/train_eval_utils.py
@@ -1,12 +1,11 @@
import math
import sys
import time
-import json
import torch
-from pycocotools.cocoeval import COCOeval
import train_utils.distributed_utils as utils
+from .coco_eval import EvalCOCOMetric
def train_one_epoch(model, optimizer, data_loader, device, epoch,
@@ -73,9 +72,7 @@ def evaluate(model, data_loader, device):
metric_logger = utils.MetricLogger(delimiter=" ")
header = "Test: "
- coco91to80 = data_loader.dataset.coco91to80
- coco80to91 = dict([(str(v), k) for k, v in coco91to80.items()])
- results = []
+ det_metric = EvalCOCOMetric(data_loader.dataset.coco, iou_type="bbox", results_file_name="det_results.json")
for image, targets in metric_logger.log_every(data_loader, 100, header):
image = list(img.to(device) for img in image)
@@ -89,36 +86,7 @@ def evaluate(model, data_loader, device):
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
model_time = time.time() - model_time
- # 遍历每张图像的预测结果
- for target, output in zip(targets, outputs):
- if len(output) == 0:
- continue
-
- img_id = int(target["image_id"])
- per_image_boxes = output["boxes"]
- # 对于coco_eval, 需要的每个box的数据格式为[x_min, y_min, w, h]
- # 而我们预测的box格式是[x_min, y_min, x_max, y_max],所以需要转下格式
- per_image_boxes[:, 2:] -= per_image_boxes[:, :2]
- per_image_classes = output["labels"]
- per_image_scores = output["scores"]
-
- # 遍历每个目标的信息
- for object_score, object_class, object_box in zip(
- per_image_scores, per_image_classes, per_image_boxes):
- object_score = float(object_score)
- # 要将类别信息还原回coco91中
- coco80_class = int(object_class)
- coco91_class = int(coco80to91[str(coco80_class)])
- # We recommend rounding coordinates to the nearest tenth of a pixel
- # to reduce resulting JSON file size.
- object_box = [round(b, 2) for b in object_box.tolist()]
-
- res = {"image_id": img_id,
- "category_id": coco91_class,
- "bbox": object_box,
- "score": round(object_score, 3)}
- results.append(res)
-
+ det_metric.update(targets, outputs)
metric_logger.update(model_time=model_time)
# gather the stats from all processes
@@ -126,29 +94,10 @@ def evaluate(model, data_loader, device):
print("Averaged stats:", metric_logger)
# 同步所有进程中的数据
- all_results = utils.all_gather(results)
+ det_metric.synchronize_results()
if utils.is_main_process():
- # 将所有进程上的数据合并到一个list当中
- results = []
- for res in all_results:
- results.extend(res)
-
- # write predict results into json file
- json_str = json.dumps(results, indent=4)
- with open('predict_tmp.json', 'w') as json_file:
- json_file.write(json_str)
-
- # accumulate predictions from all images
- coco_true = data_loader.dataset.coco
- coco_pre = coco_true.loadRes('predict_tmp.json')
-
- coco_evaluator = COCOeval(cocoGt=coco_true, cocoDt=coco_pre, iouType="bbox")
- coco_evaluator.evaluate()
- coco_evaluator.accumulate()
- coco_evaluator.summarize()
-
- coco_info = coco_evaluator.stats.tolist() # numpy to list
+ coco_info = det_metric.evaluate()
else:
coco_info = None
diff --git a/pytorch_object_detection/train_coco_dataset/validation.py b/pytorch_object_detection/train_coco_dataset/validation.py
index e6ea58dae..98a230f77 100644
--- a/pytorch_object_detection/train_coco_dataset/validation.py
+++ b/pytorch_object_detection/train_coco_dataset/validation.py
@@ -10,12 +10,13 @@
import torchvision
from tqdm import tqdm
import numpy as np
-from pycocotools.cocoeval import COCOeval
+from torchvision.models.feature_extraction import create_feature_extractor
import transforms
from network_files import FasterRCNN, AnchorsGenerator
from my_dataset import CocoDetection
-from backbone import vgg
+from backbone import resnet50
+from train_utils import EvalCOCOMetric
def summarize(self, catId=None):
@@ -99,11 +100,10 @@ def main(parser_data):
}
# read class_indict
- label_json_path = './coco80_indices.json'
+ label_json_path = './coco91_indices.json'
assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
- json_file = open(label_json_path, 'r')
- category_index = json.load(json_file)
- json_file.close()
+ with open(label_json_path, 'r') as f:
+ category_index = json.load(f)
coco_root = parser_data.data_path
@@ -122,9 +122,9 @@ def main(parser_data):
collate_fn=val_dataset.collate_fn)
# create model
- vgg_feature = vgg(model_name="vgg16", weights_path="./backbone/vgg16.pth").features
- backbone = torch.nn.Sequential(*list(vgg_feature._modules.values())[:-1]) # 删除feature中最后的maxpool层
- backbone.out_channels = 512
+ res50 = resnet50()
+ backbone = create_feature_extractor(res50, return_nodes={"layer3": "0"})
+ backbone.out_channels = 1024
anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
aspect_ratios=((0.5, 1.0, 2.0),))
@@ -140,19 +140,19 @@ def main(parser_data):
box_roi_pool=roi_pooler)
# 载入你自己训练好的模型权重
- weights_path = parser_data.weights
+ weights_path = parser_data.weights_path
assert os.path.exists(weights_path), "not found {} file.".format(weights_path)
- model.load_state_dict(torch.load(weights_path, map_location=device)['model'])
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
# print(model)
model.to(device)
# evaluate on the val dataset
cpu_device = torch.device("cpu")
- coco91to80 = val_dataset.coco91to80
- coco80to91 = dict([(str(v), k) for k, v in coco91to80.items()])
- results = []
+ det_metric = EvalCOCOMetric(val_dataset.coco, "bbox", "det_results.json")
model.eval()
with torch.no_grad():
for image, targets in tqdm(val_dataset_loader, desc="validation..."):
@@ -161,62 +161,21 @@ def main(parser_data):
# inference
outputs = model(image)
-
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
+ det_metric.update(targets, outputs)
- # 遍历每张图像的预测结果
- for target, output in zip(targets, outputs):
- if len(output) == 0:
- continue
-
- img_id = int(target["image_id"])
- per_image_boxes = output["boxes"]
- # 对于coco_eval, 需要的每个box的数据格式为[x_min, y_min, w, h]
- # 而我们预测的box格式是[x_min, y_min, x_max, y_max],所以需要转下格式
- per_image_boxes[:, 2:] -= per_image_boxes[:, :2]
- per_image_classes = output["labels"]
- per_image_scores = output["scores"]
-
- # 遍历每个目标的信息
- for object_score, object_class, object_box in zip(
- per_image_scores, per_image_classes, per_image_boxes):
- object_score = float(object_score)
- # 要将类别信息还原回coco91中
- coco80_class = int(object_class)
- coco91_class = int(coco80to91[str(coco80_class)])
- # We recommend rounding coordinates to the nearest tenth of a pixel
- # to reduce resulting JSON file size.
- object_box = [round(b, 2) for b in object_box.tolist()]
-
- res = {"image_id": img_id,
- "category_id": coco91_class,
- "bbox": object_box,
- "score": round(object_score, 3)}
- results.append(res)
-
- # accumulate predictions from all images
- # write predict results into json file
- json_str = json.dumps(results, indent=4)
- with open('predict_tmp.json', 'w') as json_file:
- json_file.write(json_str)
-
- # accumulate predictions from all images
- coco_true = val_dataset.coco
- coco_pre = coco_true.loadRes('predict_tmp.json')
-
- coco_evaluator = COCOeval(cocoGt=coco_true, cocoDt=coco_pre, iouType="bbox")
- coco_evaluator.evaluate()
- coco_evaluator.accumulate()
- coco_evaluator.summarize()
+ det_metric.synchronize_results()
+ det_metric.evaluate()
# calculate COCO info for all classes
- coco_stats, print_coco = summarize(coco_evaluator)
+ coco_stats, print_coco = summarize(det_metric.coco_evaluator)
# calculate voc info for every classes(IoU=0.5)
voc_map_info_list = []
- for i in range(len(category_index)):
- stats, _ = summarize(coco_evaluator, catId=i)
- voc_map_info_list.append(" {:15}: {}".format(category_index[str(i + 1)], stats[1]))
+ classes = [v for v in category_index.values() if v != "N/A"]
+ for i in range(len(classes)):
+ stats, _ = summarize(det_metric.coco_evaluator, catId=i)
+ voc_map_info_list.append(" {:15}: {}".format(classes[i], stats[1]))
print_voc = "\n".join(voc_map_info_list)
print(print_voc)
@@ -241,13 +200,13 @@ def main(parser_data):
parser.add_argument('--device', default='cuda', help='device')
# 检测目标类别数
- parser.add_argument('--num-classes', type=int, default='80', help='number of classes')
+ parser.add_argument('--num-classes', type=int, default=90, help='number of classes')
# 数据集的根目录(coco2017根目录)
parser.add_argument('--data-path', default='/data/coco2017', help='dataset root')
# 训练好的权重文件
- parser.add_argument('--weights', default='./save_weights/model.pth', type=str, help='training weights')
+ parser.add_argument('--weights-path', default='./save_weights/model.pth', type=str, help='training weights')
# batch size
parser.add_argument('--batch_size', default=1, type=int, metavar='N',
diff --git a/pytorch_object_detection/yolov3_spp/README.md b/pytorch_object_detection/yolov3_spp/README.md
index aee46f384..9d9301a2e 100644
--- a/pytorch_object_detection/yolov3_spp/README.md
+++ b/pytorch_object_detection/yolov3_spp/README.md
@@ -3,9 +3,9 @@
## 1 环境配置:
* Python3.6或者3.7
* Pytorch1.7.1(注意:必须是1.6.0或以上,因为使用官方提供的混合精度训练1.6.0后才支持)
-* pycocotools(Linux: ```pip install pycocotools```;
- Windows: ```pip install pycocotools-windows```(不需要额外安装vs))
-* 更多环境配置信息,请查看```requirements.txt```文件
+* pycocotools(Linux: `pip install pycocotools`;
+ Windows: `pip install pycocotools-windows`(不需要额外安装vs))
+* 更多环境配置信息,请查看`requirements.txt`文件
* 最好使用GPU训练
## 2 文件结构:
@@ -39,8 +39,8 @@
```
## 3 训练数据的准备以及目录结构
-* 这里建议标注数据时直接生成yolo格式的标签文件```.txt```,推荐使用免费开源的标注软件(支持yolo格式),[https://github.com/tzutalin/labelImg](https://github.com/tzutalin/labelImg)
-* 如果之前已经标注成pascal voc的```.xml```格式了也没关系,我写了个voc转yolo格式的转化脚本,4.1会讲怎么使用
+* 这里建议标注数据时直接生成yolo格式的标签文件`.txt`,推荐使用免费开源的标注软件(支持yolo格式),[https://github.com/tzutalin/labelImg](https://github.com/tzutalin/labelImg)
+* 如果之前已经标注成pascal voc的`.xml`格式了也没关系,我写了个voc转yolo格式的转化脚本,4.1会讲怎么使用
* 测试图像时最好将图像缩放到32的倍数
* 标注好的数据集请按照以下目录结构进行摆放:
```
@@ -58,12 +58,12 @@
├── data 利用数据集生成的一系列相关准备文件目录
│ ├── my_train_data.txt: 该文件里存储的是所有训练图片的路径地址
│ ├── my_val_data.txt: 该文件里存储的是所有验证图片的路径地址
-│ ├── my_data_label.names: 该文件里存储的是所有类别的名称,一个类别对应一行(这里会根据```.json```文件自动生成)
+│ ├── my_data_label.names: 该文件里存储的是所有类别的名称,一个类别对应一行(这里会根据`.json`文件自动生成)
│ └── my_data.data: 该文件里记录的是类别数类别信息、train以及valid对应的txt文件
```
### 4.1 将VOC标注数据转为YOLO标注数据(如果你的数据已经是YOLO格式了,可跳过该步骤)
-* 使用```trans_voc2yolo.py```脚本进行转换,并在```./data/```文件夹下生成```my_data_label.names```标签文件,
+* 使用`trans_voc2yolo.py`脚本进行转换,并在`./data/`文件夹下生成`my_data_label.names`标签文件,
* 执行脚本前,需要根据自己的路径修改以下参数
```python
# voc数据集根目录以及版本
@@ -80,7 +80,7 @@ save_file_root = "/home/wz/my_project/my_yolo_dataset"
# label标签对应json文件
label_json_path = './data/pascal_voc_classes.json'
```
-* 生成的```my_data_label.names```标签文件格式如下
+* 生成的`my_data_label.names`标签文件格式如下
```text
aeroplane
bicycle
@@ -92,7 +92,7 @@ bus
```
### 4.2 根据摆放好的数据集信息生成一系列相关准备文件
-* 使用```calculate_dataset.py```脚本生成```my_train_data.txt```文件、```my_val_data.txt```文件以及```my_data.data```文件,并生成新的```my_yolov3.cfg```文件
+* 使用`calculate_dataset.py`脚本生成`my_train_data.txt`文件、`my_val_data.txt`文件以及`my_data.data`文件,并生成新的`my_yolov3.cfg`文件
* 执行脚本前,需要根据自己的路径修改以下参数
```python
# 训练集的labels目录路径
@@ -106,21 +106,22 @@ cfg_path = "./cfg/yolov3-spp.cfg"
```
## 5 预训练权重下载地址(下载后放入weights文件夹中):
-* ```yolov3-spp-ultralytics-416.pt```: 链接: https://pan.baidu.com/s/1cK3USHKxDx-d5dONij52lA 密码: r3vm
-* ```yolov3-spp-ultralytics-512.pt```: 链接: https://pan.baidu.com/s/1k5yeTZZNv8Xqf0uBXnUK-g 密码: e3k1
-* ```yolov3-spp-ultralytics-608.pt```: 链接: https://pan.baidu.com/s/1GI8BA0wxeWMC0cjrC01G7Q 密码: ma3t
-* ```yolov3spp-voc-512.pt``` **(这是我在视频演示训练中得到的权重)**: 链接: https://pan.baidu.com/s/1aFAtaHlge0ieFtQ9nhmj3w 密码: 8ph3
+* `yolov3-spp-ultralytics-416.pt`: 链接: https://pan.baidu.com/s/1cK3USHKxDx-d5dONij52lA 密码: r3vm
+* `yolov3-spp-ultralytics-512.pt`: 链接: https://pan.baidu.com/s/1k5yeTZZNv8Xqf0uBXnUK-g 密码: e3k1
+* `yolov3-spp-ultralytics-608.pt`: 链接: https://pan.baidu.com/s/1GI8BA0wxeWMC0cjrC01G7Q 密码: ma3t
+* `yolov3spp-voc-512.pt` **(这是我在视频演示训练中得到的权重)**: 链接: https://pan.baidu.com/s/1aFAtaHlge0ieFtQ9nhmj3w 密码: 8ph3
## 6 数据集,本例程使用的是PASCAL VOC2012数据集
-* ```Pascal VOC2012``` train/val数据集下载地址:http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+* `Pascal VOC2012` train/val数据集下载地址:http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
* 如果不了解数据集或者想使用自己的数据集进行训练,请参考我的bilibili:https://b23.tv/F1kSCK
## 7 使用方法
* 确保提前准备好数据集
* 确保提前下载好对应预训练模型权重
* 若要使用单GPU训练或者使用CPU训练,直接使用train.py训练脚本
-* 若要使用多GPU训练,使用```python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py```指令,```nproc_per_node```参数为使用GPU数量
+* 若要使用多GPU训练,使用`python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py`指令,`nproc_per_node`参数为使用GPU数量
+* The `results.txt` file saved during training records, for every epoch, the COCO metrics on the validation set: the first 12 values are the COCO metrics and the last two are the mean training loss and the learning rate
## 如果对YOLOv3 SPP网络原理不是很理解可参考我的bilibili
[https://www.bilibili.com/video/BV1yi4y1g7ro?p=3](https://www.bilibili.com/video/BV1yi4y1g7ro?p=3)
diff --git a/pytorch_object_detection/yolov3_spp/build_utils/img_utils.py b/pytorch_object_detection/yolov3_spp/build_utils/img_utils.py
index fc4c71929..cabc6c9b3 100644
--- a/pytorch_object_detection/yolov3_spp/build_utils/img_utils.py
+++ b/pytorch_object_detection/yolov3_spp/build_utils/img_utils.py
@@ -37,8 +37,8 @@ def letterbox(img: np.ndarray,
dw, dh = np.mod(dw, 64), np.mod(dh, 64) # wh padding
elif scale_fill: # stretch 简单粗暴的将图片缩放到指定尺寸
dw, dh = 0, 0
- new_unpad = new_shape
- ratio = new_shape[0] / shape[1], new_shape[1] / shape[0] # wh ratios
+ new_unpad = new_shape[::-1] # [h, w] -> [w, h]
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # wh ratios
dw /= 2 # divide padding into 2 sides 将padding分到上下,左右两侧
dh /= 2
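
The `scale_fill` fix makes the stretch branch hand `cv2.resize` a `(w, h)` size and report the ratio as `(new_w / old_w, new_h / old_h)`. A tiny sketch of that convention with a non-square target (illustrative only):

```python
import cv2
import numpy as np

img = np.zeros((375, 500, 3), dtype=np.uint8)   # img.shape[:2] = (h, w) = (375, 500)
new_shape = (512, 640)                          # target (h, w)

new_unpad = new_shape[::-1]                     # (h, w) -> (w, h), the order cv2.resize expects
ratio = new_shape[1] / img.shape[1], new_shape[0] / img.shape[0]   # (w, h) scale ratios
stretched = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
print(stretched.shape, ratio)                   # (512, 640, 3) (1.28, 1.3653...)
```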
diff --git a/pytorch_object_detection/yolov3_spp/build_utils/utils.py b/pytorch_object_detection/yolov3_spp/build_utils/utils.py
index 2c6f73a6d..bf08ea70f 100755
--- a/pytorch_object_detection/yolov3_spp/build_utils/utils.py
+++ b/pytorch_object_detection/yolov3_spp/build_utils/utils.py
@@ -273,7 +273,7 @@ def build_targets(p, targets, model):
# Build targets for compute_loss(), input targets(image_idx,class,x,y,w,h)
nt = targets.shape[0]
tcls, tbox, indices, anch = [], [], [], []
- gain = torch.ones(6, device=targets.device) # normalized to gridspace gain
+ gain = torch.ones(6, device=targets.device).long() # normalized to gridspace gain
multi_gpu = type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
for i, j in enumerate(model.yolo_layers): # j: [89, 101, 113]
diff --git a/pytorch_object_detection/yolov3_spp/draw_box_utils.py b/pytorch_object_detection/yolov3_spp/draw_box_utils.py
index 4d545148e..835d7f7c1 100644
--- a/pytorch_object_detection/yolov3_spp/draw_box_utils.py
+++ b/pytorch_object_detection/yolov3_spp/draw_box_utils.py
@@ -1,7 +1,7 @@
-import collections
-from PIL import Image
+from PIL.Image import Image, fromarray
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
+from PIL import ImageColor
import numpy as np
STANDARD_COLORS = [
@@ -31,69 +31,123 @@
]
-def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map):
- for i in range(boxes.shape[0]):
- if scores[i] > thresh:
- box = tuple(boxes[i].tolist()) # numpy -> list -> tuple
- if classes[i] in category_index.keys():
- class_name = category_index[classes[i]]
- else:
- class_name = 'N/A'
- display_str = str(class_name)
- display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
- box_to_display_str_map[box].append(display_str)
- box_to_color_map[box] = STANDARD_COLORS[
- classes[i] % len(STANDARD_COLORS)]
- else:
- break # 网络输出概率已经排序过,当遇到一个不满足后面的肯定不满足
-
-
-def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
+def draw_text(draw,
+ box: list,
+ cls: int,
+ score: float,
+ category_index: dict,
+ color: str,
+ font: str = 'arial.ttf',
+ font_size: int = 24):
+ """
+    Draw the object bounding box and class label onto the image.
+ """
try:
- font = ImageFont.truetype('arial.ttf', 20)
+ font = ImageFont.truetype(font, font_size)
except IOError:
font = ImageFont.load_default()
+ left, top, right, bottom = box
# If the total height of the display strings added to the top of the bounding
# box exceeds the top of the image, stack the strings below the bounding box
# instead of above.
- display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]]
+ display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
+ display_str_heights = [font.getsize(ds)[1] for ds in display_str]
# Each display_str has a top and bottom margin of 0.05x.
- total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
+ display_str_height = (1 + 2 * 0.05) * max(display_str_heights)
- if top > total_display_str_height:
+ if top > display_str_height:
+ text_top = top - display_str_height
text_bottom = top
else:
- text_bottom = bottom + total_display_str_height
- # Reverse list and print from bottom to top.
- for display_str in box_to_display_str_map[box][::-1]:
- text_width, text_height = font.getsize(display_str)
- margin = np.ceil(0.05 * text_height)
- draw.rectangle([(left, text_bottom - text_height - 2 * margin),
- (left + text_width, text_bottom)], fill=color)
- draw.text((left + margin, text_bottom - text_height - margin),
- display_str,
+ text_top = bottom
+ text_bottom = bottom + display_str_height
+
+ for ds in display_str:
+ text_width, text_height = font.getsize(ds)
+ margin = np.ceil(0.05 * text_width)
+ draw.rectangle([(left, text_top),
+ (left + text_width + 2 * margin, text_bottom)], fill=color)
+ draw.text((left + margin, text_top),
+ ds,
fill='black',
font=font)
- text_bottom -= text_height - 2 * margin
-
-
-def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
- box_to_display_str_map = collections.defaultdict(list)
- box_to_color_map = collections.defaultdict(str)
-
- filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)
-
- # Draw all boxes onto image.
- if isinstance(image, np.ndarray):
- image = Image.fromarray(image)
- draw = ImageDraw.Draw(image)
- im_width, im_height = image.size
- for box, color in box_to_color_map.items():
- xmin, ymin, xmax, ymax = box
- (left, right, top, bottom) = (xmin * 1, xmax * 1,
- ymin * 1, ymax * 1)
- draw.line([(left, top), (left, bottom), (right, bottom),
- (right, top), (left, top)], width=line_thickness, fill=color)
- draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
+ left += text_width
+
+
+def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
+ np_image = np.array(image)
+ masks = np.where(masks > thresh, True, False)
+
+ # colors = np.array(colors)
+ img_to_draw = np.copy(np_image)
+ # TODO: There might be a way to vectorize this
+ for mask, color in zip(masks, colors):
+ img_to_draw[mask] = color
+
+ out = np_image * (1 - alpha) + img_to_draw * alpha
+ return fromarray(out.astype(np.uint8))
+
+
+def draw_objs(image: Image,
+ boxes: np.ndarray = None,
+ classes: np.ndarray = None,
+ scores: np.ndarray = None,
+ masks: np.ndarray = None,
+ category_index: dict = None,
+ box_thresh: float = 0.1,
+ mask_thresh: float = 0.5,
+ line_thickness: int = 8,
+ font: str = 'arial.ttf',
+ font_size: int = 24,
+ draw_boxes_on_image: bool = True,
+ draw_masks_on_image: bool = False):
+ """
+    Draw the bounding boxes, class labels and masks onto the image.
+    Args:
+        image: the image to draw on
+        boxes: bounding box information
+        classes: class information
+        scores: score (probability) information
+        masks: mask information
+        category_index: dict mapping class ids to names
+        box_thresh: score threshold used to filter detections
+        mask_thresh:
+        line_thickness: bounding-box line width
+        font: font type
+        font_size: font size
+        draw_boxes_on_image:
+        draw_masks_on_image:
+
+ Returns:
+
+ """
+
+    # filter out low-score objects
+ idxs = np.greater(scores, box_thresh)
+ boxes = boxes[idxs]
+ classes = classes[idxs]
+ scores = scores[idxs]
+ if masks is not None:
+ masks = masks[idxs]
+ if len(boxes) == 0:
+ return image
+
+ colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]
+
+ if draw_boxes_on_image:
+ # Draw all boxes onto image.
+ draw = ImageDraw.Draw(image)
+ for box, cls, score, color in zip(boxes, classes, scores, colors):
+ left, top, right, bottom = box
+            # draw the bounding box
+ draw.line([(left, top), (left, bottom), (right, bottom),
+ (right, top), (left, top)], width=line_thickness, fill=color)
+            # draw the class label and score
+ draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)
+
+ if draw_masks_on_image and (masks is not None):
+ # Draw all mask onto image.
+ image = draw_masks(image, masks, colors, mask_thresh)
+
return image
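
The new `draw_objs` takes a PIL image plus numpy arrays and returns the annotated PIL image, unlike the old in-place `draw_box`. A small usage sketch with made-up detections; the class id and the `category_index` entry are hypothetical:

```python
import numpy as np
from PIL import Image
from draw_box_utils import draw_objs

image = Image.open("test.jpg")
boxes = np.array([[50., 40., 210., 300.]])   # [x_min, y_min, x_max, y_max] in pixels
classes = np.array([12])                     # hypothetical class id
scores = np.array([0.93])
category_index = {"12": "dog"}               # str(class id) -> class name

plot_img = draw_objs(image, boxes, classes, scores,
                     category_index=category_index,
                     box_thresh=0.5, line_thickness=3,
                     font='arial.ttf', font_size=20)
plot_img.save("test_result.jpg")
```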
diff --git a/pytorch_object_detection/yolov3_spp/load_onnx_test.py b/pytorch_object_detection/yolov3_spp/load_onnx_test.py
index 1ac3dcbd5..de33fc3dd 100644
--- a/pytorch_object_detection/yolov3_spp/load_onnx_test.py
+++ b/pytorch_object_detection/yolov3_spp/load_onnx_test.py
@@ -150,7 +150,7 @@ def nms(bboxes: np.ndarray, iou_threshold=0.5, soft_threshold=0.3, sigma=0.5, me
bboxes = bboxes[iou_mask]
- return np.array(best_bboxes_index, dtype=np.int8)
+ return np.array(best_bboxes_index, dtype=np.int32)
def post_process(pred: np.ndarray, multi_label=False, conf_thres=0.3):
diff --git a/pytorch_object_detection/yolov3_spp/predict_test.py b/pytorch_object_detection/yolov3_spp/predict_test.py
index 67dd40b4f..bbd2d87b4 100644
--- a/pytorch_object_detection/yolov3_spp/predict_test.py
+++ b/pytorch_object_detection/yolov3_spp/predict_test.py
@@ -6,16 +6,17 @@
import cv2
import numpy as np
from matplotlib import pyplot as plt
+from PIL import Image
from build_utils import img_utils, torch_utils, utils
from models import Darknet
-from draw_box_utils import draw_box
+from draw_box_utils import draw_objs
def main():
img_size = 512 # 必须是32的整数倍 [416, 512, 608]
cfg = "cfg/my_yolov3.cfg" # 改成生成的.cfg文件
- weights = "weights/yolov3spp-voc-512.pt" # 改成自己训练好的权重文件
+    weights_path = "weights/yolov3spp-voc-512.pt"  # change to your own trained weights file
json_path = "./data/pascal_voc_classes.json" # json标签文件
img_path = "test.jpg"
assert os.path.exists(cfg), "cfg file {} dose not exist.".format(cfg)
@@ -23,17 +24,19 @@ def main():
assert os.path.exists(json_path), "json file {} dose not exist.".format(json_path)
assert os.path.exists(img_path), "image file {} dose not exist.".format(img_path)
- json_file = open(json_path, 'r')
- class_dict = json.load(json_file)
- json_file.close()
- category_index = {v: k for k, v in class_dict.items()}
+ with open(json_path, 'r') as f:
+ class_dict = json.load(f)
+
+ category_index = {str(v): str(k) for k, v in class_dict.items()}
input_size = (img_size, img_size)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Darknet(cfg, img_size)
- model.load_state_dict(torch.load(weights, map_location=device)["model"])
+ weights_dict = torch.load(weights_path, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
model.to(device)
model.eval()
@@ -75,11 +78,20 @@ def main():
scores = pred[:, 4].detach().cpu().numpy()
classes = pred[:, 5].detach().cpu().numpy().astype(np.int) + 1
- img_o = draw_box(img_o[:, :, ::-1], bboxes, classes, scores, category_index)
- plt.imshow(img_o)
+ pil_img = Image.fromarray(img_o[:, :, ::-1])
+ plot_img = draw_objs(pil_img,
+ bboxes,
+ classes,
+ scores,
+ category_index=category_index,
+ box_thresh=0.2,
+ line_thickness=3,
+ font='arial.ttf',
+ font_size=20)
+ plt.imshow(plot_img)
plt.show()
-
- img_o.save("test_result.jpg")
+    # save the prediction result image
+ plot_img.save("test_result.jpg")
if __name__ == "__main__":
diff --git a/pytorch_object_detection/yolov3_spp/validation.py b/pytorch_object_detection/yolov3_spp/validation.py
index 34737cbae..074b3c839 100644
--- a/pytorch_object_detection/yolov3_spp/validation.py
+++ b/pytorch_object_detection/yolov3_spp/validation.py
@@ -89,9 +89,9 @@ def main(parser_data):
# read class_indict
label_json_path = './data/pascal_voc_classes.json'
assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
- json_file = open(label_json_path, 'r')
- class_dict = json.load(json_file)
- json_file.close()
+ with open(label_json_path, 'r') as f:
+ class_dict = json.load(f)
+
category_index = {v: k for k, v in class_dict.items()}
data_dict = parse_data_cfg(parser_data.data)
@@ -116,7 +116,9 @@ def main(parser_data):
# create model
model = Darknet(parser_data.cfg, parser_data.img_size)
- model.load_state_dict(torch.load(parser_data.weights, map_location=device)["model"])
+ weights_dict = torch.load(parser_data.weights, map_location='cpu')
+ weights_dict = weights_dict["model"] if "model" in weights_dict else weights_dict
+ model.load_state_dict(weights_dict)
model.to(device)
# evaluate on the test dataset
diff --git a/pytorch_segmentation/deeplab_v3/predict.py b/pytorch_segmentation/deeplab_v3/predict.py
index 98fc5a4db..1e14eb3c7 100644
--- a/pytorch_segmentation/deeplab_v3/predict.py
+++ b/pytorch_segmentation/deeplab_v3/predict.py
@@ -69,7 +69,7 @@ def main():
t_start = time_synchronized()
output = model(img.to(device))
t_end = time_synchronized()
- print("inference+NMS time: {}".format(t_end - t_start))
+ print("inference time: {}".format(t_end - t_start))
prediction = output['out'].argmax(1).squeeze(0)
prediction = prediction.to("cpu").numpy().astype(np.uint8)
diff --git a/pytorch_segmentation/deeplab_v3/requirements.txt b/pytorch_segmentation/deeplab_v3/requirements.txt
index 50b913cfc..ede3e2584 100644
--- a/pytorch_segmentation/deeplab_v3/requirements.txt
+++ b/pytorch_segmentation/deeplab_v3/requirements.txt
@@ -1,4 +1,4 @@
-numpy==1.21.3
+numpy==1.22.0
torch==1.10.0
torchvision==0.11.1
-Pillow==8.4.0
\ No newline at end of file
+Pillow
diff --git a/pytorch_segmentation/fcn/predict.py b/pytorch_segmentation/fcn/predict.py
index c9db2865a..f25222e58 100644
--- a/pytorch_segmentation/fcn/predict.py
+++ b/pytorch_segmentation/fcn/predict.py
@@ -69,7 +69,7 @@ def main():
t_start = time_synchronized()
output = model(img.to(device))
t_end = time_synchronized()
- print("inference+NMS time: {}".format(t_end - t_start))
+ print("inference time: {}".format(t_end - t_start))
prediction = output['out'].argmax(1).squeeze(0)
prediction = prediction.to("cpu").numpy().astype(np.uint8)
diff --git a/pytorch_segmentation/fcn/requirements.txt b/pytorch_segmentation/fcn/requirements.txt
index 50b913cfc..2c58f889e 100644
--- a/pytorch_segmentation/fcn/requirements.txt
+++ b/pytorch_segmentation/fcn/requirements.txt
@@ -1,4 +1,4 @@
-numpy==1.21.3
+numpy==1.22.0
 torch==1.10.0
torchvision==0.11.1
-Pillow==8.4.0
\ No newline at end of file
+Pillow
diff --git a/pytorch_segmentation/lraspp/predict.py b/pytorch_segmentation/lraspp/predict.py
index 11ebfd7b3..27963fbc3 100644
--- a/pytorch_segmentation/lraspp/predict.py
+++ b/pytorch_segmentation/lraspp/predict.py
@@ -63,7 +63,7 @@ def main():
t_start = time_synchronized()
output = model(img.to(device))
t_end = time_synchronized()
- print("inference+NMS time: {}".format(t_end - t_start))
+ print("inference time: {}".format(t_end - t_start))
prediction = output['out'].argmax(1).squeeze(0)
prediction = prediction.to("cpu").numpy().astype(np.uint8)
diff --git a/pytorch_segmentation/lraspp/requirements.txt b/pytorch_segmentation/lraspp/requirements.txt
index 50b913cfc..ede3e2584 100644
--- a/pytorch_segmentation/lraspp/requirements.txt
+++ b/pytorch_segmentation/lraspp/requirements.txt
@@ -1,4 +1,4 @@
-numpy==1.21.3
+numpy==1.22.0
torch==1.10.0
torchvision==0.11.1
-Pillow==8.4.0
\ No newline at end of file
+Pillow
diff --git a/pytorch_segmentation/u2net/README.md b/pytorch_segmentation/u2net/README.md
new file mode 100644
index 000000000..aa0ae013e
--- /dev/null
+++ b/pytorch_segmentation/u2net/README.md
@@ -0,0 +1,90 @@
+# U2-Net (Going Deeper with Nested U-Structure for Salient Object Detection)
+
+## This project is mainly based on the official source code
+- https://github.com/xuebinqin/U-2-Net
+- Note that this project targets Salient Object Detection (SOD)
+
+## Environment
+- Python 3.6/3.7/3.8
+- PyTorch 1.10
+- Ubuntu or CentOS (multi-GPU training is not supported on Windows)
+- Training on a GPU is recommended
+- See `requirements.txt` for the detailed environment setup
+
+
+## Project structure
+```
+├── src: model definition code
+├── train_utils: training and validation utilities
+├── my_dataset.py: custom dataset loading code
+├── predict.py: simple prediction script
+├── train.py: training script for a single GPU or the CPU
+├── train_multi_GPU.py: multi-GPU training script
+├── validation.py: standalone model evaluation script
+├── transforms.py: data preprocessing code
+└── requirements.txt: project dependencies
+```
+
+## Preparing the DUTS dataset
+- Official DUTS download page: [http://saliencydetection.net/duts/](http://saliencydetection.net/duts/)
+- If the official download does not work, you can use my Baidu cloud mirror, link: https://pan.baidu.com/s/1nBI6GTN0ZilqH4Tvu18dow password: r7k6
+- DUTS-TR is the training set and DUTS-TE is the test (validation) set; after extraction the directory layout is:
+```
+├── DUTS-TR
+│    ├── DUTS-TR-Image: all training-set images
+│    └── DUTS-TR-Mask: the corresponding GT labels of the training images (mask form)
+│
+└── DUTS-TE
+     ├── DUTS-TE-Image: all test (validation) images
+     └── DUTS-TE-Mask: the corresponding GT labels of the test (validation) images (mask form)
+```
+- Note: during training or validation, point `--data-path` at the root directory that contains `DUTS-TR`
+
+## Official weights
+Weights converted from the official release:
+- `u2net_full.pth` download link: https://pan.baidu.com/s/1ojJZS8v3F_eFKkF3DEdEXA password: fh1v
+- `u2net_lite.pth` download link: https://pan.baidu.com/s/1TIWoiuEz9qRvTX9quDqQHg password: 5stj
+
+Validation results of `u2net_full` on DUTS-TE (evaluated with `validation.py`; a sample command is shown below the numbers):
+```
+MAE: 0.044
+maxF1: 0.868
+```
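+
+A minimal sketch of such an evaluation run; the flags correspond to the argparse options in `validation.py`, and the dataset path is a placeholder you need to adapt:
+```
+python validation.py --weights ./u2net_full.pth --data-path /path/to/DUTS_root --device cuda:0
+```
+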
+**Notes:**
+- The maxF1 here differs slightly from the paper; the difference comes from post_norm. The original repository applies post_norm to the predictions, while this repository removes it.
+With post_norm the maxF1 is `0.872`. If you need this post-processing, add it yourself; the post_norm procedure is as follows, where `output` is the network prediction during validation:
+```python
+ma = torch.max(output)
+mi = torch.min(output)
+output = (output - mi) / (ma - mi)
+```
+- To load the official weights, the conv bias in the `ConvBNReLU` class in `src/model.py` must be set to True, because the official code never sets it (Conv2d's bias defaults to True).
+Since the conv is followed by BN, the bias has no effect anyway, which is why this repository sets bias to False by default; a sketch of the required change is shown below.
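+
+A minimal sketch of that one-line change inside `ConvBNReLU.__init__` (only the `bias` argument differs from the code shipped in this repository):
+```python
+# flip bias to True only when loading the official weights; with the following
+# BatchNorm layer the bias is redundant, which is why this repo defaults to False
+self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=padding, dilation=dilation, bias=True)
+```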
+
+## Training record (`u2net_full`)
+Training command:
+```
+torchrun --nproc_per_node=4 train_multi_GPU.py --lr 0.004 --amp
+```
+Final validation results on DUTS-TE:
+```
+MAE: 0.047
+maxF1: 0.859
+```
+The full training log is in the results.txt file; trained weights download link: https://pan.baidu.com/s/1df2jMkrjbgEv-r1NMaZCZg password: n4l6
+
+## How to train
+* Make sure the dataset is prepared in advance
+* To train on a single GPU or on the CPU, use the train.py script directly (see the example after this list)
+* To train on multiple GPUs, use `torchrun --nproc_per_node=8 train_multi_GPU.py`, where `nproc_per_node` is the number of GPUs to use
+* To restrict training to specific GPUs, prepend `CUDA_VISIBLE_DEVICES=0,3` to the command (e.g. to use only the 1st and 4th GPU of the machine)
+* `CUDA_VISIBLE_DEVICES=0,3 torchrun --nproc_per_node=2 train_multi_GPU.py`
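+
+A sketch of a typical single-GPU run; the flags map to the argparse options in `train.py`, and the dataset path is a placeholder:
+```
+python train.py --data-path /path/to/DUTS_root -b 16 --epochs 360 --amp
+```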
+
+## If you are not familiar with the U2-Net network, see my bilibili video
+- [https://www.bilibili.com/video/BV1yB4y1z7m](https://www.bilibili.com/video/BV1yB4y1z7m)
+
+## For a closer look at this project and an analysis of the U2-Net code, see my bilibili video
+- [https://www.bilibili.com/video/BV1Kt4y137iS](https://www.bilibili.com/video/BV1Kt4y137iS)
+
+## U2-Net network structure
+
\ No newline at end of file
diff --git a/pytorch_segmentation/u2net/convert_weight.py b/pytorch_segmentation/u2net/convert_weight.py
new file mode 100644
index 000000000..df447e72b
--- /dev/null
+++ b/pytorch_segmentation/u2net/convert_weight.py
@@ -0,0 +1,140 @@
+import re
+import torch
+from src import u2net_full, u2net_lite
+
+layers = {"encode": [7, 6, 5, 4, 4, 4],
+ "decode": [4, 4, 5, 6, 7]}
+
+
+def convert_conv_bn(new_weight, prefix, ks, v):
+ if "conv" in ks[0]:
+ if "weight" == ks[1]:
+ new_weight[prefix + ".conv.weight"] = v
+ elif "bias" == ks[1]:
+ new_weight[prefix + ".conv.bias"] = v
+ else:
+ print(f"unrecognized weight {prefix + ks[1]}")
+ return
+
+ if "bn" in ks[0]:
+ if "running_mean" == ks[1]:
+ new_weight[prefix + ".bn.running_mean"] = v
+ elif "running_var" == ks[1]:
+ new_weight[prefix + ".bn.running_var"] = v
+ elif "weight" == ks[1]:
+ new_weight[prefix + ".bn.weight"] = v
+ elif "bias" == ks[1]:
+ new_weight[prefix + ".bn.bias"] = v
+ elif "num_batches_tracked" == ks[1]:
+ return
+ else:
+ print(f"unrecognized weight {prefix + ks[1]}")
+ return
+
+
+def convert(old_weight: dict):
+ new_weight = {}
+ for k, v in old_weight.items():
+ ks = k.split(".")
+ if ("stage" in ks[0]) and ("d" not in ks[0]):
+ # encode stage
+ num = int(re.findall(r'\d', ks[0])[0]) - 1
+ prefix = f"encode_modules.{num}"
+ if "rebnconvin" == ks[1]:
+ # ConvBNReLU module
+ prefix += ".conv_in"
+ convert_conv_bn(new_weight, prefix, ks[2:], v)
+ elif ("rebnconv" in ks[1]) and ("d" not in ks[1]):
+ num_ = int(re.findall(r'\d', ks[1])[0]) - 1
+ prefix += f".encode_modules.{num_}"
+ convert_conv_bn(new_weight, prefix, ks[2:], v)
+ elif ("rebnconv" in ks[1]) and ("d" in ks[1]):
+ num_ = layers["encode"][num] - int(re.findall(r'\d', ks[1])[0]) - 1
+ prefix += f".decode_modules.{num_}"
+ convert_conv_bn(new_weight, prefix, ks[2:], v)
+ else:
+ print(f"unrecognized key: {k}")
+
+ elif ("stage" in ks[0]) and ("d" in ks[0]):
+ # decode stage
+ num = 5 - int(re.findall(r'\d', ks[0])[0])
+ prefix = f"decode_modules.{num}"
+ if "rebnconvin" == ks[1]:
+ # ConvBNReLU module
+ prefix += ".conv_in"
+ convert_conv_bn(new_weight, prefix, ks[2:], v)
+ elif ("rebnconv" in ks[1]) and ("d" not in ks[1]):
+ num_ = int(re.findall(r'\d', ks[1])[0]) - 1
+ prefix += f".encode_modules.{num_}"
+ convert_conv_bn(new_weight, prefix, ks[2:], v)
+ elif ("rebnconv" in ks[1]) and ("d" in ks[1]):
+ num_ = layers["decode"][num] - int(re.findall(r'\d', ks[1])[0]) - 1
+ prefix += f".decode_modules.{num_}"
+ convert_conv_bn(new_weight, prefix, ks[2:], v)
+ else:
+ print(f"unrecognized key: {k}")
+ elif "side" in ks[0]:
+ # side
+ num = 6 - int(re.findall(r'\d', ks[0])[0])
+ prefix = f"side_modules.{num}"
+ if "weight" == ks[1]:
+ new_weight[prefix + ".weight"] = v
+ elif "bias" == ks[1]:
+ new_weight[prefix + ".bias"] = v
+ else:
+ print(f"unrecognized weight {prefix + ks[1]}")
+ elif "outconv" in ks[0]:
+ prefix = f"out_conv"
+ if "weight" == ks[1]:
+ new_weight[prefix + ".weight"] = v
+ elif "bias" == ks[1]:
+ new_weight[prefix + ".bias"] = v
+ else:
+ print(f"unrecognized weight {prefix + ks[1]}")
+ else:
+ print(f"unrecognized key: {k}")
+
+ return new_weight
+
+
+def main_1():
+ from u2net import U2NET, U2NETP
+
+ old_m = U2NET()
+ old_m.load_state_dict(torch.load("u2net.pth", map_location='cpu'))
+ new_m = u2net_full()
+
+ # old_m = U2NETP()
+ # old_m.load_state_dict(torch.load("u2netp.pth", map_location='cpu'))
+ # new_m = u2net_lite()
+
+ old_w = old_m.state_dict()
+
+ w = convert(old_w)
+ new_m.load_state_dict(w, strict=True)
+
+ torch.random.manual_seed(0)
+ x = torch.randn(1, 3, 288, 288)
+ old_m.eval()
+ new_m.eval()
+ with torch.no_grad():
+ out1 = old_m(x)[0]
+ out2 = new_m(x)
+ assert torch.equal(out1, out2)
+ torch.save(new_m.state_dict(), "u2net_full.pth")
+
+
+def main():
+ old_w = torch.load("u2net.pth", map_location='cpu')
+ new_m = u2net_full()
+
+ # old_w = torch.load("u2netp.pth", map_location='cpu')
+ # new_m = u2net_lite()
+
+ w = convert(old_w)
+ new_m.load_state_dict(w, strict=True)
+ torch.save(new_m.state_dict(), "u2net_full.pth")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/pytorch_segmentation/u2net/my_dataset.py b/pytorch_segmentation/u2net/my_dataset.py
new file mode 100644
index 000000000..6c993db5d
--- /dev/null
+++ b/pytorch_segmentation/u2net/my_dataset.py
@@ -0,0 +1,80 @@
+import os
+
+import cv2
+import torch.utils.data as data
+
+
+class DUTSDataset(data.Dataset):
+ def __init__(self, root: str, train: bool = True, transforms=None):
+ assert os.path.exists(root), f"path '{root}' does not exist."
+ if train:
+ self.image_root = os.path.join(root, "DUTS-TR", "DUTS-TR-Image")
+ self.mask_root = os.path.join(root, "DUTS-TR", "DUTS-TR-Mask")
+ else:
+ self.image_root = os.path.join(root, "DUTS-TE", "DUTS-TE-Image")
+ self.mask_root = os.path.join(root, "DUTS-TE", "DUTS-TE-Mask")
+ assert os.path.exists(self.image_root), f"path '{self.image_root}' does not exist."
+ assert os.path.exists(self.mask_root), f"path '{self.mask_root}' does not exist."
+
+ image_names = [p for p in os.listdir(self.image_root) if p.endswith(".jpg")]
+ mask_names = [p for p in os.listdir(self.mask_root) if p.endswith(".png")]
+        assert len(image_names) > 0, f"no images found in {self.image_root}."
+
+ # check images and mask
+ re_mask_names = []
+ for p in image_names:
+ mask_name = p.replace(".jpg", ".png")
+ assert mask_name in mask_names, f"{p} has no corresponding mask."
+ re_mask_names.append(mask_name)
+ mask_names = re_mask_names
+
+ self.images_path = [os.path.join(self.image_root, n) for n in image_names]
+ self.masks_path = [os.path.join(self.mask_root, n) for n in mask_names]
+
+ self.transforms = transforms
+
+ def __getitem__(self, idx):
+ image_path = self.images_path[idx]
+ mask_path = self.masks_path[idx]
+ image = cv2.imread(image_path, flags=cv2.IMREAD_COLOR)
+ assert image is not None, f"failed to read image: {image_path}"
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # BGR -> RGB
+ h, w, _ = image.shape
+
+ target = cv2.imread(mask_path, flags=cv2.IMREAD_GRAYSCALE)
+ assert target is not None, f"failed to read mask: {mask_path}"
+
+ if self.transforms is not None:
+ image, target = self.transforms(image, target)
+
+ return image, target
+
+ def __len__(self):
+ return len(self.images_path)
+
+ @staticmethod
+ def collate_fn(batch):
+ images, targets = list(zip(*batch))
+ batched_imgs = cat_list(images, fill_value=0)
+ batched_targets = cat_list(targets, fill_value=0)
+
+ return batched_imgs, batched_targets
+
+
+def cat_list(images, fill_value=0):
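+    # pad every image/mask in the batch with fill_value up to the max H and W in the batch, then stack them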
+ max_size = tuple(max(s) for s in zip(*[img.shape for img in images]))
+ batch_shape = (len(images),) + max_size
+ batched_imgs = images[0].new(*batch_shape).fill_(fill_value)
+ for img, pad_img in zip(images, batched_imgs):
+ pad_img[..., :img.shape[-2], :img.shape[-1]].copy_(img)
+ return batched_imgs
+
+
+if __name__ == '__main__':
+ train_dataset = DUTSDataset("./", train=True)
+ print(len(train_dataset))
+
+ val_dataset = DUTSDataset("./", train=False)
+ print(len(val_dataset))
+
+ i, t = train_dataset[0]
diff --git a/pytorch_segmentation/u2net/predict.py b/pytorch_segmentation/u2net/predict.py
new file mode 100644
index 000000000..26b2d257a
--- /dev/null
+++ b/pytorch_segmentation/u2net/predict.py
@@ -0,0 +1,71 @@
+import os
+import time
+
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+from torchvision.transforms import transforms
+
+from src import u2net_full
+
+
+def time_synchronized():
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    return time.time()
+
+
+def main():
+ weights_path = "./u2net_full.pth"
+ img_path = "./test.png"
+ threshold = 0.5
+
+    assert os.path.exists(img_path), f"image file {img_path} does not exist."
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ data_transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Resize(320),
+ transforms.Normalize(mean=(0.485, 0.456, 0.406),
+ std=(0.229, 0.224, 0.225))
+ ])
+
+ origin_img = cv2.cvtColor(cv2.imread(img_path, flags=cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
+
+ h, w = origin_img.shape[:2]
+ img = data_transform(origin_img)
+ img = torch.unsqueeze(img, 0).to(device) # [C, H, W] -> [1, C, H, W]
+
+ model = u2net_full()
+ weights = torch.load(weights_path, map_location='cpu')
+ if "model" in weights:
+ model.load_state_dict(weights["model"])
+ else:
+ model.load_state_dict(weights)
+ model.to(device)
+ model.eval()
+
+ with torch.no_grad():
+ # init model
+ img_height, img_width = img.shape[-2:]
+ init_img = torch.zeros((1, 3, img_height, img_width), device=device)
+ model(init_img)
+
+ t_start = time_synchronized()
+ pred = model(img)
+ t_end = time_synchronized()
+ print("inference time: {}".format(t_end - t_start))
+ pred = torch.squeeze(pred).to("cpu").numpy() # [1, 1, H, W] -> [H, W]
+
+ pred = cv2.resize(pred, dsize=(w, h), interpolation=cv2.INTER_LINEAR)
+ pred_mask = np.where(pred > threshold, 1, 0)
+ origin_img = np.array(origin_img, dtype=np.uint8)
+ seg_img = origin_img * pred_mask[..., None]
+ plt.imshow(seg_img)
+ plt.show()
+ cv2.imwrite("pred_result.png", cv2.cvtColor(seg_img.astype(np.uint8), cv2.COLOR_RGB2BGR))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/pytorch_segmentation/u2net/requirements.txt b/pytorch_segmentation/u2net/requirements.txt
new file mode 100644
index 000000000..a47904dd0
--- /dev/null
+++ b/pytorch_segmentation/u2net/requirements.txt
@@ -0,0 +1,4 @@
+numpy==1.22.0
+torch==1.13.1
+torchvision==0.14.1
+opencv_python==4.5.4.60
diff --git a/pytorch_segmentation/u2net/results20220723-123632.txt b/pytorch_segmentation/u2net/results20220723-123632.txt
new file mode 100644
index 000000000..e174d4c94
--- /dev/null
+++ b/pytorch_segmentation/u2net/results20220723-123632.txt
@@ -0,0 +1,37 @@
+[epoch: 0] train_loss: 2.7385 lr: 0.002002 MAE: 0.465 maxF1: 0.464
+[epoch: 10] train_loss: 1.0385 lr: 0.003994 MAE: 0.124 maxF1: 0.719
+[epoch: 20] train_loss: 0.7629 lr: 0.003972 MAE: 0.077 maxF1: 0.787
+[epoch: 30] train_loss: 0.6758 lr: 0.003936 MAE: 0.083 maxF1: 0.791
+[epoch: 40] train_loss: 0.4905 lr: 0.003884 MAE: 0.073 maxF1: 0.805
+[epoch: 50] train_loss: 0.4337 lr: 0.003818 MAE: 0.063 maxF1: 0.821
+[epoch: 60] train_loss: 0.4157 lr: 0.003738 MAE: 0.067 maxF1: 0.818
+[epoch: 70] train_loss: 0.3424 lr: 0.003644 MAE: 0.058 maxF1: 0.840
+[epoch: 80] train_loss: 0.2909 lr: 0.003538 MAE: 0.057 maxF1: 0.842
+[epoch: 90] train_loss: 0.3220 lr: 0.003420 MAE: 0.064 maxF1: 0.837
+[epoch: 100] train_loss: 0.2653 lr: 0.003292 MAE: 0.055 maxF1: 0.847
+[epoch: 110] train_loss: 0.2627 lr: 0.003153 MAE: 0.055 maxF1: 0.846
+[epoch: 120] train_loss: 0.3230 lr: 0.003005 MAE: 0.058 maxF1: 0.837
+[epoch: 130] train_loss: 0.2177 lr: 0.002850 MAE: 0.053 maxF1: 0.852
+[epoch: 140] train_loss: 0.2807 lr: 0.002688 MAE: 0.061 maxF1: 0.824
+[epoch: 150] train_loss: 0.2091 lr: 0.002520 MAE: 0.057 maxF1: 0.846
+[epoch: 160] train_loss: 0.1971 lr: 0.002349 MAE: 0.049 maxF1: 0.857
+[epoch: 170] train_loss: 0.2157 lr: 0.002175 MAE: 0.050 maxF1: 0.851
+[epoch: 180] train_loss: 0.1881 lr: 0.002000 MAE: 0.048 maxF1: 0.857
+[epoch: 190] train_loss: 0.1855 lr: 0.001825 MAE: 0.047 maxF1: 0.860
+[epoch: 200] train_loss: 0.1817 lr: 0.001651 MAE: 0.047 maxF1: 0.863
+[epoch: 210] train_loss: 0.1740 lr: 0.001480 MAE: 0.048 maxF1: 0.858
+[epoch: 220] train_loss: 0.1707 lr: 0.001312 MAE: 0.048 maxF1: 0.860
+[epoch: 230] train_loss: 0.1653 lr: 0.001150 MAE: 0.048 maxF1: 0.859
+[epoch: 240] train_loss: 0.1652 lr: 0.000995 MAE: 0.046 maxF1: 0.860
+[epoch: 250] train_loss: 0.1631 lr: 0.000847 MAE: 0.048 maxF1: 0.857
+[epoch: 260] train_loss: 0.1584 lr: 0.000708 MAE: 0.047 maxF1: 0.862
+[epoch: 270] train_loss: 0.1590 lr: 0.000580 MAE: 0.047 maxF1: 0.860
+[epoch: 280] train_loss: 0.1521 lr: 0.000462 MAE: 0.047 maxF1: 0.861
+[epoch: 290] train_loss: 0.1535 lr: 0.000356 MAE: 0.047 maxF1: 0.861
+[epoch: 300] train_loss: 0.1520 lr: 0.000262 MAE: 0.047 maxF1: 0.860
+[epoch: 310] train_loss: 0.1488 lr: 0.000182 MAE: 0.047 maxF1: 0.860
+[epoch: 320] train_loss: 0.1493 lr: 0.000116 MAE: 0.047 maxF1: 0.859
+[epoch: 330] train_loss: 0.1470 lr: 0.000064 MAE: 0.047 maxF1: 0.860
+[epoch: 340] train_loss: 0.1493 lr: 0.000028 MAE: 0.047 maxF1: 0.859
+[epoch: 350] train_loss: 0.1482 lr: 0.000006 MAE: 0.047 maxF1: 0.858
+[epoch: 359] train_loss: 0.1518 lr: 0.000000 MAE: 0.047 maxF1: 0.859
diff --git a/pytorch_segmentation/u2net/src/__init__.py b/pytorch_segmentation/u2net/src/__init__.py
new file mode 100644
index 000000000..9411dd2c0
--- /dev/null
+++ b/pytorch_segmentation/u2net/src/__init__.py
@@ -0,0 +1 @@
+from .model import u2net_full, u2net_lite
diff --git a/pytorch_segmentation/u2net/src/model.py b/pytorch_segmentation/u2net/src/model.py
new file mode 100644
index 000000000..9c5b38a25
--- /dev/null
+++ b/pytorch_segmentation/u2net/src/model.py
@@ -0,0 +1,233 @@
+from typing import Union, List
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ConvBNReLU(nn.Module):
+ def __init__(self, in_ch: int, out_ch: int, kernel_size: int = 3, dilation: int = 1):
+ super().__init__()
+
+ padding = kernel_size // 2 if dilation == 1 else dilation
+ self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=padding, dilation=dilation, bias=False)
+ self.bn = nn.BatchNorm2d(out_ch)
+ self.relu = nn.ReLU(inplace=True)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.relu(self.bn(self.conv(x)))
+
+
+class DownConvBNReLU(ConvBNReLU):
+ def __init__(self, in_ch: int, out_ch: int, kernel_size: int = 3, dilation: int = 1, flag: bool = True):
+ super().__init__(in_ch, out_ch, kernel_size, dilation)
+ self.down_flag = flag
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if self.down_flag:
+ x = F.max_pool2d(x, kernel_size=2, stride=2, ceil_mode=True)
+
+ return self.relu(self.bn(self.conv(x)))
+
+
+class UpConvBNReLU(ConvBNReLU):
+ def __init__(self, in_ch: int, out_ch: int, kernel_size: int = 3, dilation: int = 1, flag: bool = True):
+ super().__init__(in_ch, out_ch, kernel_size, dilation)
+ self.up_flag = flag
+
+ def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
+ if self.up_flag:
+ x1 = F.interpolate(x1, size=x2.shape[2:], mode='bilinear', align_corners=False)
+ return self.relu(self.bn(self.conv(torch.cat([x1, x2], dim=1))))
+
+
+class RSU(nn.Module):
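+    # Residual U-block (RSU): a small nested U-Net whose output is added back to its input feature map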
+ def __init__(self, height: int, in_ch: int, mid_ch: int, out_ch: int):
+ super().__init__()
+
+ assert height >= 2
+ self.conv_in = ConvBNReLU(in_ch, out_ch)
+
+ encode_list = [DownConvBNReLU(out_ch, mid_ch, flag=False)]
+ decode_list = [UpConvBNReLU(mid_ch * 2, mid_ch, flag=False)]
+ for i in range(height - 2):
+ encode_list.append(DownConvBNReLU(mid_ch, mid_ch))
+ decode_list.append(UpConvBNReLU(mid_ch * 2, mid_ch if i < height - 3 else out_ch))
+
+ encode_list.append(ConvBNReLU(mid_ch, mid_ch, dilation=2))
+ self.encode_modules = nn.ModuleList(encode_list)
+ self.decode_modules = nn.ModuleList(decode_list)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x_in = self.conv_in(x)
+
+ x = x_in
+ encode_outputs = []
+ for m in self.encode_modules:
+ x = m(x)
+ encode_outputs.append(x)
+
+ x = encode_outputs.pop()
+ for m in self.decode_modules:
+ x2 = encode_outputs.pop()
+ x = m(x, x2)
+
+ return x + x_in
+
+
+class RSU4F(nn.Module):
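+    # RSU-4F: dilated-convolution variant of RSU that keeps the spatial resolution unchanged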
+ def __init__(self, in_ch: int, mid_ch: int, out_ch: int):
+ super().__init__()
+ self.conv_in = ConvBNReLU(in_ch, out_ch)
+ self.encode_modules = nn.ModuleList([ConvBNReLU(out_ch, mid_ch),
+ ConvBNReLU(mid_ch, mid_ch, dilation=2),
+ ConvBNReLU(mid_ch, mid_ch, dilation=4),
+ ConvBNReLU(mid_ch, mid_ch, dilation=8)])
+
+ self.decode_modules = nn.ModuleList([ConvBNReLU(mid_ch * 2, mid_ch, dilation=4),
+ ConvBNReLU(mid_ch * 2, mid_ch, dilation=2),
+ ConvBNReLU(mid_ch * 2, out_ch)])
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x_in = self.conv_in(x)
+
+ x = x_in
+ encode_outputs = []
+ for m in self.encode_modules:
+ x = m(x)
+ encode_outputs.append(x)
+
+ x = encode_outputs.pop()
+ for m in self.decode_modules:
+ x2 = encode_outputs.pop()
+ x = m(torch.cat([x, x2], dim=1))
+
+ return x + x_in
+
+
+class U2Net(nn.Module):
+ def __init__(self, cfg: dict, out_ch: int = 1):
+ super().__init__()
+ assert "encode" in cfg
+ assert "decode" in cfg
+ self.encode_num = len(cfg["encode"])
+
+ encode_list = []
+ side_list = []
+ for c in cfg["encode"]:
+ # c: [height, in_ch, mid_ch, out_ch, RSU4F, side]
+ assert len(c) == 6
+ encode_list.append(RSU(*c[:4]) if c[4] is False else RSU4F(*c[1:4]))
+
+ if c[5] is True:
+ side_list.append(nn.Conv2d(c[3], out_ch, kernel_size=3, padding=1))
+ self.encode_modules = nn.ModuleList(encode_list)
+
+ decode_list = []
+ for c in cfg["decode"]:
+ # c: [height, in_ch, mid_ch, out_ch, RSU4F, side]
+ assert len(c) == 6
+ decode_list.append(RSU(*c[:4]) if c[4] is False else RSU4F(*c[1:4]))
+
+ if c[5] is True:
+ side_list.append(nn.Conv2d(c[3], out_ch, kernel_size=3, padding=1))
+ self.decode_modules = nn.ModuleList(decode_list)
+ self.side_modules = nn.ModuleList(side_list)
+ self.out_conv = nn.Conv2d(self.encode_num * out_ch, out_ch, kernel_size=1)
+
+ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, List[torch.Tensor]]:
+ _, _, h, w = x.shape
+
+ # collect encode outputs
+ encode_outputs = []
+ for i, m in enumerate(self.encode_modules):
+ x = m(x)
+ encode_outputs.append(x)
+ if i != self.encode_num - 1:
+ x = F.max_pool2d(x, kernel_size=2, stride=2, ceil_mode=True)
+
+ # collect decode outputs
+ x = encode_outputs.pop()
+ decode_outputs = [x]
+ for m in self.decode_modules:
+ x2 = encode_outputs.pop()
+ x = F.interpolate(x, size=x2.shape[2:], mode='bilinear', align_corners=False)
+ x = m(torch.concat([x, x2], dim=1))
+ decode_outputs.insert(0, x)
+
+ # collect side outputs
+ side_outputs = []
+ for m in self.side_modules:
+ x = decode_outputs.pop()
+ x = F.interpolate(m(x), size=[h, w], mode='bilinear', align_corners=False)
+ side_outputs.insert(0, x)
+
+ x = self.out_conv(torch.concat(side_outputs, dim=1))
+
+ if self.training:
+            # return logits (no sigmoid) so training stays AMP-safe; the loss uses BCE-with-logits
+ return [x] + side_outputs
+ else:
+ return torch.sigmoid(x)
+
+
+def u2net_full(out_ch: int = 1):
+ cfg = {
+ # height, in_ch, mid_ch, out_ch, RSU4F, side
+ "encode": [[7, 3, 32, 64, False, False], # En1
+ [6, 64, 32, 128, False, False], # En2
+ [5, 128, 64, 256, False, False], # En3
+ [4, 256, 128, 512, False, False], # En4
+ [4, 512, 256, 512, True, False], # En5
+ [4, 512, 256, 512, True, True]], # En6
+ # height, in_ch, mid_ch, out_ch, RSU4F, side
+ "decode": [[4, 1024, 256, 512, True, True], # De5
+ [4, 1024, 128, 256, False, True], # De4
+ [5, 512, 64, 128, False, True], # De3
+ [6, 256, 32, 64, False, True], # De2
+ [7, 128, 16, 64, False, True]] # De1
+ }
+
+ return U2Net(cfg, out_ch)
+
+
+def u2net_lite(out_ch: int = 1):
+ cfg = {
+ # height, in_ch, mid_ch, out_ch, RSU4F, side
+ "encode": [[7, 3, 16, 64, False, False], # En1
+ [6, 64, 16, 64, False, False], # En2
+ [5, 64, 16, 64, False, False], # En3
+ [4, 64, 16, 64, False, False], # En4
+ [4, 64, 16, 64, True, False], # En5
+ [4, 64, 16, 64, True, True]], # En6
+ # height, in_ch, mid_ch, out_ch, RSU4F, side
+ "decode": [[4, 128, 16, 64, True, True], # De5
+ [4, 128, 16, 64, False, True], # De4
+ [5, 128, 16, 64, False, True], # De3
+ [6, 128, 16, 64, False, True], # De2
+ [7, 128, 16, 64, False, True]] # De1
+ }
+
+ return U2Net(cfg, out_ch)
+
+
+def convert_onnx(m, save_path):
+ m.eval()
+ x = torch.rand(1, 3, 288, 288, requires_grad=True)
+
+ # export the model
+ torch.onnx.export(m, # model being run
+ x, # model input (or a tuple for multiple inputs)
+ save_path, # where to save the model (can be a file or file-like object)
+ export_params=True,
+ opset_version=11)
+
+
+if __name__ == '__main__':
+ # n_m = RSU(height=7, in_ch=3, mid_ch=12, out_ch=3)
+ # convert_onnx(n_m, "RSU7.onnx")
+ #
+ # n_m = RSU4F(in_ch=3, mid_ch=12, out_ch=3)
+ # convert_onnx(n_m, "RSU4F.onnx")
+
+ u2net = u2net_full()
+ convert_onnx(u2net, "u2net_full.onnx")
diff --git a/pytorch_segmentation/u2net/train.py b/pytorch_segmentation/u2net/train.py
new file mode 100644
index 000000000..4ccbf96d4
--- /dev/null
+++ b/pytorch_segmentation/u2net/train.py
@@ -0,0 +1,160 @@
+import os
+import time
+import datetime
+from typing import Union, List
+
+import torch
+from torch.utils import data
+
+from src import u2net_full
+from train_utils import train_one_epoch, evaluate, get_params_groups, create_lr_scheduler
+from my_dataset import DUTSDataset
+import transforms as T
+
+
+class SODPresetTrain:
+ def __init__(self, base_size: Union[int, List[int]], crop_size: int,
+ hflip_prob=0.5, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
+ self.transforms = T.Compose([
+ T.ToTensor(),
+ T.Resize(base_size, resize_mask=True),
+ T.RandomCrop(crop_size),
+ T.RandomHorizontalFlip(hflip_prob),
+ T.Normalize(mean=mean, std=std)
+ ])
+
+ def __call__(self, img, target):
+ return self.transforms(img, target)
+
+
+class SODPresetEval:
+ def __init__(self, base_size: Union[int, List[int]], mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
+ self.transforms = T.Compose([
+ T.ToTensor(),
+ T.Resize(base_size, resize_mask=False),
+ T.Normalize(mean=mean, std=std),
+ ])
+
+ def __call__(self, img, target):
+ return self.transforms(img, target)
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+ batch_size = args.batch_size
+
+    # file used to record training and validation information
+ results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
+
+ train_dataset = DUTSDataset(args.data_path, train=True, transforms=SODPresetTrain([320, 320], crop_size=288))
+ val_dataset = DUTSDataset(args.data_path, train=False, transforms=SODPresetEval([320, 320]))
+
+ num_workers = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
+ train_data_loader = data.DataLoader(train_dataset,
+ batch_size=batch_size,
+ num_workers=num_workers,
+ shuffle=True,
+ pin_memory=True,
+ collate_fn=train_dataset.collate_fn)
+
+ val_data_loader = data.DataLoader(val_dataset,
+ batch_size=1, # must be 1
+ num_workers=num_workers,
+ pin_memory=True,
+ collate_fn=val_dataset.collate_fn)
+
+ model = u2net_full()
+ model.to(device)
+
+ params_group = get_params_groups(model, weight_decay=args.weight_decay)
+ optimizer = torch.optim.AdamW(params_group, lr=args.lr, weight_decay=args.weight_decay)
+ lr_scheduler = create_lr_scheduler(optimizer, len(train_data_loader), args.epochs,
+ warmup=True, warmup_epochs=2)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+ if args.resume:
+ checkpoint = torch.load(args.resume, map_location='cpu')
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp:
+ scaler.load_state_dict(checkpoint["scaler"])
+
+ current_mae, current_f1 = 1.0, 0.0
+ start_time = time.time()
+ for epoch in range(args.start_epoch, args.epochs):
+ mean_loss, lr = train_one_epoch(model, optimizer, train_data_loader, device, epoch,
+ lr_scheduler=lr_scheduler, print_freq=args.print_freq, scaler=scaler)
+
+ save_file = {"model": model.state_dict(),
+ "optimizer": optimizer.state_dict(),
+ "lr_scheduler": lr_scheduler.state_dict(),
+ "epoch": epoch,
+ "args": args}
+ if args.amp:
+ save_file["scaler"] = scaler.state_dict()
+
+ if epoch % args.eval_interval == 0 or epoch == args.epochs - 1:
+            # validate only every eval_interval epochs to reduce validation overhead and save training time
+ mae_metric, f1_metric = evaluate(model, val_data_loader, device=device)
+ mae_info, f1_info = mae_metric.compute(), f1_metric.compute()
+ print(f"[epoch: {epoch}] val_MAE: {mae_info:.3f} val_maxF1: {f1_info:.3f}")
+ # write into txt
+ with open(results_file, "a") as f:
+                # record train_loss, lr and the validation metrics for each epoch
+ write_info = f"[epoch: {epoch}] train_loss: {mean_loss:.4f} lr: {lr:.6f} " \
+ f"MAE: {mae_info:.3f} maxF1: {f1_info:.3f} \n"
+ f.write(write_info)
+
+            # save the best checkpoint and update the best metrics seen so far
+            if current_mae >= mae_info and current_f1 <= f1_info:
+                current_mae, current_f1 = mae_info, f1_info
+                torch.save(save_file, "save_weights/model_best.pth")
+
+ # only save latest 10 epoch weights
+ if os.path.exists(f"save_weights/model_{epoch-10}.pth"):
+ os.remove(f"save_weights/model_{epoch-10}.pth")
+
+ torch.save(save_file, f"save_weights/model_{epoch}.pth")
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print("training time {}".format(total_time_str))
+
+
+def parse_args():
+ import argparse
+ parser = argparse.ArgumentParser(description="pytorch u2net training")
+
+ parser.add_argument("--data-path", default="./", help="DUTS root")
+ parser.add_argument("--device", default="cuda", help="training device")
+ parser.add_argument("-b", "--batch-size", default=16, type=int)
+ parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+ metavar='W', help='weight decay (default: 1e-4)',
+ dest='weight_decay')
+ parser.add_argument("--epochs", default=360, type=int, metavar="N",
+ help="number of total epochs to train")
+ parser.add_argument("--eval-interval", default=10, type=int, help="validation interval default 10 Epochs")
+
+ parser.add_argument('--lr', default=0.001, type=float, help='initial learning rate')
+ parser.add_argument('--print-freq', default=50, type=int, help='print frequency')
+ parser.add_argument('--resume', default='', help='resume from checkpoint')
+ parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+ help='start epoch')
+ # Mixed precision training parameters
+ parser.add_argument("--amp", action='http://github.com/store_true',
+ help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+
+ return args
+
+
+if __name__ == '__main__':
+ args = parse_args()
+
+ if not os.path.exists("./save_weights"):
+ os.mkdir("./save_weights")
+
+ main(args)
diff --git a/pytorch_segmentation/u2net/train_multi_GPU.py b/pytorch_segmentation/u2net/train_multi_GPU.py
new file mode 100644
index 000000000..1a62a0ec7
--- /dev/null
+++ b/pytorch_segmentation/u2net/train_multi_GPU.py
@@ -0,0 +1,224 @@
+import time
+import os
+import datetime
+from typing import Union, List
+
+import torch
+from torch.utils import data
+
+from src import u2net_full
+from train_utils import (train_one_epoch, evaluate, init_distributed_mode, save_on_master, mkdir,
+ create_lr_scheduler, get_params_groups)
+from my_dataset import DUTSDataset
+import transforms as T
+
+
+class SODPresetTrain:
+ def __init__(self, base_size: Union[int, List[int]], crop_size: int,
+ hflip_prob=0.5, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
+ self.transforms = T.Compose([
+ T.ToTensor(),
+ T.Resize(base_size, resize_mask=True),
+ T.RandomCrop(crop_size),
+ T.RandomHorizontalFlip(hflip_prob),
+ T.Normalize(mean=mean, std=std)
+ ])
+
+ def __call__(self, img, target):
+ return self.transforms(img, target)
+
+
+class SODPresetEval:
+ def __init__(self, base_size: Union[int, List[int]], mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
+ self.transforms = T.Compose([
+ T.ToTensor(),
+ T.Resize(base_size, resize_mask=False),
+ T.Normalize(mean=mean, std=std),
+ ])
+
+ def __call__(self, img, target):
+ return self.transforms(img, target)
+
+
+def main(args):
+ init_distributed_mode(args)
+ print(args)
+
+ device = torch.device(args.device)
+
+    # file used to record training and validation information
+ results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
+
+ train_dataset = DUTSDataset(args.data_path, train=True, transforms=SODPresetTrain([320, 320], crop_size=288))
+ val_dataset = DUTSDataset(args.data_path, train=False, transforms=SODPresetEval([320, 320]))
+
+ print("Creating data loaders")
+ if args.distributed:
+ train_sampler = data.distributed.DistributedSampler(train_dataset)
+ test_sampler = data.distributed.DistributedSampler(val_dataset)
+ else:
+ train_sampler = data.RandomSampler(train_dataset)
+ test_sampler = data.SequentialSampler(val_dataset)
+
+ train_data_loader = data.DataLoader(
+ train_dataset, batch_size=args.batch_size,
+ sampler=train_sampler, num_workers=args.workers,
+ pin_memory=True, collate_fn=train_dataset.collate_fn, drop_last=True)
+
+ val_data_loader = data.DataLoader(
+ val_dataset, batch_size=1, # batch_size must be 1
+ sampler=test_sampler, num_workers=args.workers,
+ pin_memory=True, collate_fn=train_dataset.collate_fn)
+
+    # create the U2-Net model
+ model = u2net_full()
+ model.to(device)
+
+ if args.sync_bn:
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
+ model_without_ddp = model
+ if args.distributed:
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
+ model_without_ddp = model.module
+
+ params_group = get_params_groups(model, weight_decay=args.weight_decay)
+ optimizer = torch.optim.AdamW(params_group, lr=args.lr, weight_decay=args.weight_decay)
+ lr_scheduler = create_lr_scheduler(optimizer, len(train_data_loader), args.epochs,
+ warmup=True, warmup_epochs=2)
+
+ scaler = torch.cuda.amp.GradScaler() if args.amp else None
+
+    # if --resume points to a previous checkpoint, continue training from it
+ if args.resume:
+ # If map_location is missing, torch.load will first load the module to CPU
+ # and then copy each parameter to where it was saved,
+ # which would result in all processes on the same machine using the same set of devices.
+        checkpoint = torch.load(args.resume, map_location='cpu')  # load the saved checkpoint (including optimizer and lr scheduler state)
+ model_without_ddp.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+ if args.amp:
+ scaler.load_state_dict(checkpoint["scaler"])
+
+ if args.test_only:
+ mae_metric, f1_metric = evaluate(model, val_data_loader, device=device)
+ print(mae_metric, f1_metric)
+ return
+
+ print("Start training")
+ current_mae, current_f1 = 1.0, 0.0
+ start_time = time.time()
+ for epoch in range(args.start_epoch, args.epochs):
+ if args.distributed:
+ train_sampler.set_epoch(epoch)
+
+ mean_loss, lr = train_one_epoch(model, optimizer, train_data_loader, device, epoch,
+ lr_scheduler=lr_scheduler, print_freq=args.print_freq, scaler=scaler)
+
+ save_file = {'model': model_without_ddp.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ "lr_scheduler": lr_scheduler.state_dict(),
+ 'args': args,
+ 'epoch': epoch}
+ if args.amp:
+ save_file["scaler"] = scaler.state_dict()
+
+ if epoch % args.eval_interval == 0 or epoch == args.epochs - 1:
+            # validate only every eval_interval epochs to reduce validation overhead and save training time
+ mae_metric, f1_metric = evaluate(model, val_data_loader, device=device)
+ mae_info, f1_info = mae_metric.compute(), f1_metric.compute()
+ print(f"[epoch: {epoch}] val_MAE: {mae_info:.3f} val_maxF1: {f1_info:.3f}")
+
+            # write only on the main process
+ if args.rank in [-1, 0]:
+ # write into txt
+ with open(results_file, "a") as f:
+                    # record train_loss, lr and the validation metrics for each epoch
+ write_info = f"[epoch: {epoch}] train_loss: {mean_loss:.4f} lr: {lr:.6f} " \
+ f"MAE: {mae_info:.3f} maxF1: {f1_info:.3f} \n"
+ f.write(write_info)
+
+                # save the best checkpoint and update the best metrics seen so far
+                if current_mae >= mae_info and current_f1 <= f1_info:
+                    current_mae, current_f1 = mae_info, f1_info
+                    if args.output_dir:
+                        # save weights on the main node only
+                        save_on_master(save_file,
+                                       os.path.join(args.output_dir, 'model_best.pth'))
+
+ if args.output_dir:
+ if args.rank in [-1, 0]:
+ # only save latest 10 epoch weights
+ if os.path.exists(os.path.join(args.output_dir, f'model_{epoch - 10}.pth')):
+ os.remove(os.path.join(args.output_dir, f'model_{epoch - 10}.pth'))
+
+                # save weights on the main node only
+ save_on_master(save_file,
+ os.path.join(args.output_dir, f'model_{epoch}.pth'))
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('Training time {}'.format(total_time_str))
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description=__doc__)
+
+    # root directory of the DUTS dataset
+ parser.add_argument('--data-path', default='./', help='DUTS root')
+    # training device type
+ parser.add_argument('--device', default='cuda', help='device')
+    # batch size per GPU
+ parser.add_argument('-b', '--batch-size', default=16, type=int,
+ help='images per gpu, the total batch size is $NGPU x batch_size')
+    # epoch to resume/start training from
+ parser.add_argument('--start-epoch', default=0, type=int, help='start epoch')
+    # total number of training epochs
+ parser.add_argument('--epochs', default=360, type=int, metavar='N',
+ help='number of total epochs to run')
+ parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+ metavar='W', help='weight decay (default: 1e-4)',
+ dest='weight_decay')
+    # whether to use SyncBatchNorm across GPUs (off by default; enabling it slows down training)
+ parser.add_argument('--sync-bn', action='http://github.com/store_true', help='whether using SyncBatchNorm')
+    # number of data loading and preprocessing workers
+ parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+ help='number of data loading workers (default: 4)')
+    # initial learning rate
+ parser.add_argument('--lr', default=0.001, type=float,
+ help='initial learning rate')
+    # validation frequency
+ parser.add_argument("--eval-interval", default=10, type=int, help="validation interval default 10 Epochs")
+    # print frequency during training
+ parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
+    # directory where weights are saved
+ parser.add_argument('--output-dir', default='./multi_train', help='path where to save')
+    # resume training from the given checkpoint
+ parser.add_argument('--resume', default='', help='resume from checkpoint')
+    # evaluate only, do not train
+ parser.add_argument(
+ "--test-only",
+ dest="test_only",
+ help="Only test the model",
+ action="http://github.com/store_true",
+ )
+
+    # number of distributed processes
+ parser.add_argument('--world-size', default=1, type=int,
+ help='number of distributed processes')
+ parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
+ # Mixed precision training parameters
+ parser.add_argument("--amp", action='http://github.com/store_true',
+ help="Use torch.cuda.amp for mixed precision training")
+
+ args = parser.parse_args()
+
+    # if an output directory was specified, create it if it does not exist
+ if args.output_dir:
+ mkdir(args.output_dir)
+
+ main(args)
diff --git a/pytorch_segmentation/u2net/train_utils/__init__.py b/pytorch_segmentation/u2net/train_utils/__init__.py
new file mode 100644
index 000000000..dfe313dd8
--- /dev/null
+++ b/pytorch_segmentation/u2net/train_utils/__init__.py
@@ -0,0 +1,2 @@
+from .train_and_eval import train_one_epoch, evaluate, create_lr_scheduler, get_params_groups
+from .distributed_utils import init_distributed_mode, save_on_master, mkdir
diff --git a/pytorch_segmentation/u2net/train_utils/distributed_utils.py b/pytorch_segmentation/u2net/train_utils/distributed_utils.py
new file mode 100644
index 000000000..c9bfebb5d
--- /dev/null
+++ b/pytorch_segmentation/u2net/train_utils/distributed_utils.py
@@ -0,0 +1,356 @@
+from collections import defaultdict, deque
+import datetime
+import time
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+import errno
+import os
+
+
+class SmoothedValue(object):
+ """Track a series of values and provide access to smoothed values over a
+ window or the global series average.
+ """
+
+ def __init__(self, window_size=20, fmt=None):
+ if fmt is None:
+ fmt = "{value:.4f} ({global_avg:.4f})"
+ self.deque = deque(maxlen=window_size)
+ self.total = 0.0
+ self.count = 0
+ self.fmt = fmt
+
+ def update(self, value, n=1):
+ self.deque.append(value)
+ self.count += n
+ self.total += value * n
+
+ def synchronize_between_processes(self):
+ """
+ Warning: does not synchronize the deque!
+ """
+ if not is_dist_avail_and_initialized():
+ return
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+ dist.barrier()
+ dist.all_reduce(t)
+ t = t.tolist()
+ self.count = int(t[0])
+ self.total = t[1]
+
+ @property
+ def median(self):
+ d = torch.tensor(list(self.deque))
+ return d.median().item()
+
+ @property
+ def avg(self):
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
+ return d.mean().item()
+
+ @property
+ def global_avg(self):
+ return self.total / self.count
+
+ @property
+ def max(self):
+ return max(self.deque)
+
+ @property
+ def value(self):
+ return self.deque[-1]
+
+ def __str__(self):
+ return self.fmt.format(
+ median=self.median,
+ avg=self.avg,
+ global_avg=self.global_avg,
+ max=self.max,
+ value=self.value)
+
+
+def all_gather(data):
+ """
+    Gather data from every process.
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
+ Args:
+ data: any picklable object
+ Returns:
+ list[data]: list of data gathered from each rank
+ """
+    world_size = get_world_size()  # number of processes
+ if world_size == 1:
+ return [data]
+
+ data_list = [None] * world_size
+ dist.all_gather_object(data_list, data)
+
+ return data_list
+
+
+class MeanAbsoluteError(object):
+ def __init__(self):
+ self.mae_list = []
+
+ def update(self, pred: torch.Tensor, gt: torch.Tensor):
+ batch_size, c, h, w = gt.shape
+ assert batch_size == 1, f"validation mode batch_size must be 1, but got batch_size: {batch_size}."
+ resize_pred = F.interpolate(pred, (h, w), mode="bilinear", align_corners=False)
+ error_pixels = torch.sum(torch.abs(resize_pred - gt), dim=(1, 2, 3)) / (h * w)
+ self.mae_list.extend(error_pixels.tolist())
+
+ def compute(self):
+ mae = sum(self.mae_list) / len(self.mae_list)
+ return mae
+
+ def gather_from_all_processes(self):
+ if not torch.distributed.is_available():
+ return
+ if not torch.distributed.is_initialized():
+ return
+ torch.distributed.barrier()
+ gather_mae_list = []
+ for i in all_gather(self.mae_list):
+ gather_mae_list.extend(i)
+ self.mae_list = gather_mae_list
+
+ def __str__(self):
+ mae = self.compute()
+ return f'MAE: {mae:.3f}'
+
+
+class F1Score(object):
+ """
+ refer: https://github.com/xuebinqin/DIS/blob/main/IS-Net/basics.py
+ """
+
+ def __init__(self, threshold: float = 0.5):
+ self.precision_cum = None
+ self.recall_cum = None
+ self.num_cum = None
+ self.threshold = threshold
+
+ def update(self, pred: torch.Tensor, gt: torch.Tensor):
+ batch_size, c, h, w = gt.shape
+ assert batch_size == 1, f"validation mode batch_size must be 1, but got batch_size: {batch_size}."
+ resize_pred = F.interpolate(pred, (h, w), mode="bilinear", align_corners=False)
+ gt_num = torch.sum(torch.gt(gt, self.threshold).float())
+
+        pp = resize_pred[torch.gt(gt, self.threshold)]  # predicted values at pixels where the GT is foreground
+        nn = resize_pred[torch.le(gt, self.threshold)]  # predicted values at pixels where the GT is background
+
+ pp_hist = torch.histc(pp, bins=255, min=0.0, max=1.0)
+ nn_hist = torch.histc(nn, bins=255, min=0.0, max=1.0)
+
+ # Sort according to the prediction probability from large to small
+ pp_hist_flip = torch.flipud(pp_hist)
+ nn_hist_flip = torch.flipud(nn_hist)
+
+ pp_hist_flip_cum = torch.cumsum(pp_hist_flip, dim=0)
+ nn_hist_flip_cum = torch.cumsum(nn_hist_flip, dim=0)
+
+ precision = pp_hist_flip_cum / (pp_hist_flip_cum + nn_hist_flip_cum + 1e-4)
+ recall = pp_hist_flip_cum / (gt_num + 1e-4)
+
+ if self.precision_cum is None:
+ self.precision_cum = torch.full_like(precision, fill_value=0.)
+
+ if self.recall_cum is None:
+ self.recall_cum = torch.full_like(recall, fill_value=0.)
+
+ if self.num_cum is None:
+ self.num_cum = torch.zeros([1], dtype=gt.dtype, device=gt.device)
+
+ self.precision_cum += precision
+ self.recall_cum += recall
+ self.num_cum += batch_size
+
+ def compute(self):
+ pre_mean = self.precision_cum / self.num_cum
+ rec_mean = self.recall_cum / self.num_cum
+ f1_mean = (1 + 0.3) * pre_mean * rec_mean / (0.3 * pre_mean + rec_mean + 1e-8)
+ max_f1 = torch.amax(f1_mean).item()
+ return max_f1
+
+ def reduce_from_all_processes(self):
+ if not torch.distributed.is_available():
+ return
+ if not torch.distributed.is_initialized():
+ return
+ torch.distributed.barrier()
+ torch.distributed.all_reduce(self.precision_cum)
+ torch.distributed.all_reduce(self.recall_cum)
+ torch.distributed.all_reduce(self.num_cum)
+
+ def __str__(self):
+ max_f1 = self.compute()
+ return f'maxF1: {max_f1:.3f}'
+
+
+class MetricLogger(object):
+ def __init__(self, delimiter="\t"):
+ self.meters = defaultdict(SmoothedValue)
+ self.delimiter = delimiter
+
+ def update(self, **kwargs):
+ for k, v in kwargs.items():
+ if isinstance(v, torch.Tensor):
+ v = v.item()
+ assert isinstance(v, (float, int))
+ self.meters[k].update(v)
+
+ def __getattr__(self, attr):
+ if attr in self.meters:
+ return self.meters[attr]
+ if attr in self.__dict__:
+ return self.__dict__[attr]
+ raise AttributeError("'{}' object has no attribute '{}'".format(
+ type(self).__name__, attr))
+
+ def __str__(self):
+ loss_str = []
+ for name, meter in self.meters.items():
+ loss_str.append(
+ "{}: {}".format(name, str(meter))
+ )
+ return self.delimiter.join(loss_str)
+
+ def synchronize_between_processes(self):
+ for meter in self.meters.values():
+ meter.synchronize_between_processes()
+
+ def add_meter(self, name, meter):
+ self.meters[name] = meter
+
+ def log_every(self, iterable, print_freq, header=None):
+ i = 0
+ if not header:
+ header = ''
+ start_time = time.time()
+ end = time.time()
+ iter_time = SmoothedValue(fmt='{avg:.4f}')
+ data_time = SmoothedValue(fmt='{avg:.4f}')
+ space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
+ if torch.cuda.is_available():
+ log_msg = self.delimiter.join([
+ header,
+ '[{0' + space_fmt + '}/{1}]',
+ 'eta: {eta}',
+ '{meters}',
+ 'time: {time}',
+ 'data: {data}',
+ 'max mem: {memory:.0f}'
+ ])
+ else:
+ log_msg = self.delimiter.join([
+ header,
+ '[{0' + space_fmt + '}/{1}]',
+ 'eta: {eta}',
+ '{meters}',
+ 'time: {time}',
+ 'data: {data}'
+ ])
+ MB = 1024.0 * 1024.0
+ for obj in iterable:
+ data_time.update(time.time() - end)
+ yield obj
+ iter_time.update(time.time() - end)
+ if i % print_freq == 0:
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+ if torch.cuda.is_available():
+ print(log_msg.format(
+ i, len(iterable), eta=eta_string,
+ meters=str(self),
+ time=str(iter_time), data=str(data_time),
+ memory=torch.cuda.max_memory_allocated() / MB))
+ else:
+ print(log_msg.format(
+ i, len(iterable), eta=eta_string,
+ meters=str(self),
+ time=str(iter_time), data=str(data_time)))
+ i += 1
+ end = time.time()
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('{} Total time: {}'.format(header, total_time_str))
+
+
+def mkdir(path):
+ try:
+ os.makedirs(path)
+ except OSError as e:
+ if e.errno != errno.EEXIST:
+ raise
+
+
+def setup_for_distributed(is_master):
+ """
+ This function disables printing when not in master process
+ """
+ import builtins as __builtin__
+ builtin_print = __builtin__.print
+
+ def print(*args, **kwargs):
+ force = kwargs.pop('force', False)
+ if is_master or force:
+ builtin_print(*args, **kwargs)
+
+ __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+ if not dist.is_available():
+ return False
+ if not dist.is_initialized():
+ return False
+ return True
+
+
+def get_world_size():
+ if not is_dist_avail_and_initialized():
+ return 1
+ return dist.get_world_size()
+
+
+def get_rank():
+ if not is_dist_avail_and_initialized():
+ return 0
+ return dist.get_rank()
+
+
+def is_main_process():
+ return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):
+ if is_main_process():
+ torch.save(*args, **kwargs)
+
+
+def init_distributed_mode(args):
+ if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+ args.rank = int(os.environ["RANK"])
+ args.world_size = int(os.environ['WORLD_SIZE'])
+ args.gpu = int(os.environ['LOCAL_RANK'])
+ elif 'SLURM_PROCID' in os.environ:
+ args.rank = int(os.environ['SLURM_PROCID'])
+ args.gpu = args.rank % torch.cuda.device_count()
+ elif hasattr(args, "rank"):
+ pass
+ else:
+ print('Not using distributed mode')
+ args.distributed = False
+ return
+
+ args.distributed = True
+
+ torch.cuda.set_device(args.gpu)
+ args.dist_backend = 'nccl'
+ print('| distributed init (rank {}): {}'.format(
+ args.rank, args.dist_url), flush=True)
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+ world_size=args.world_size, rank=args.rank)
+ setup_for_distributed(args.rank == 0)
diff --git a/pytorch_segmentation/u2net/train_utils/train_and_eval.py b/pytorch_segmentation/u2net/train_utils/train_and_eval.py
new file mode 100644
index 000000000..3ff1150aa
--- /dev/null
+++ b/pytorch_segmentation/u2net/train_utils/train_and_eval.py
@@ -0,0 +1,111 @@
+import math
+import torch
+from torch.nn import functional as F
+import train_utils.distributed_utils as utils
+
+
+def criterion(inputs, target):
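+    # inputs = [fused output] + side outputs; apply BCE-with-logits to each and sum them (deep supervision)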
+ losses = [F.binary_cross_entropy_with_logits(inputs[i], target) for i in range(len(inputs))]
+ total_loss = sum(losses)
+
+ return total_loss
+
+
+def evaluate(model, data_loader, device):
+ model.eval()
+ mae_metric = utils.MeanAbsoluteError()
+ f1_metric = utils.F1Score()
+ metric_logger = utils.MetricLogger(delimiter=" ")
+ header = 'Test:'
+ with torch.no_grad():
+ for images, targets in metric_logger.log_every(data_loader, 100, header):
+ images, targets = images.to(device), targets.to(device)
+ output = model(images)
+
+ # post norm
+ # ma = torch.max(output)
+ # mi = torch.min(output)
+ # output = (output - mi) / (ma - mi)
+
+ mae_metric.update(output, targets)
+ f1_metric.update(output, targets)
+
+ mae_metric.gather_from_all_processes()
+ f1_metric.reduce_from_all_processes()
+
+ return mae_metric, f1_metric
+
+
+def train_one_epoch(model, optimizer, data_loader, device, epoch, lr_scheduler, print_freq=10, scaler=None):
+ model.train()
+ metric_logger = utils.MetricLogger(delimiter=" ")
+ metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+ header = 'Epoch: [{}]'.format(epoch)
+
+ for image, target in metric_logger.log_every(data_loader, print_freq, header):
+ image, target = image.to(device), target.to(device)
+ with torch.cuda.amp.autocast(enabled=scaler is not None):
+ output = model(image)
+ loss = criterion(output, target)
+
+ optimizer.zero_grad()
+ if scaler is not None:
+ scaler.scale(loss).backward()
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ loss.backward()
+ optimizer.step()
+
+ lr_scheduler.step()
+
+ lr = optimizer.param_groups[0]["lr"]
+ metric_logger.update(loss=loss.item(), lr=lr)
+
+ return metric_logger.meters["loss"].global_avg, lr
+
+
+def create_lr_scheduler(optimizer,
+ num_step: int,
+ epochs: int,
+ warmup=True,
+ warmup_epochs=1,
+ warmup_factor=1e-3,
+ end_factor=1e-6):
+ assert num_step > 0 and epochs > 0
+ if warmup is False:
+ warmup_epochs = 0
+
+ def f(x):
+ """
+        Return a learning-rate multiplier for the given step.
+        Note that PyTorch calls lr_scheduler.step() once before training starts.
+ """
+ if warmup is True and x <= (warmup_epochs * num_step):
+ alpha = float(x) / (warmup_epochs * num_step)
+            # during warmup the lr factor grows from warmup_factor -> 1
+ return warmup_factor * (1 - alpha) + alpha
+ else:
+ current_step = (x - warmup_epochs * num_step)
+ cosine_steps = (epochs - warmup_epochs) * num_step
+            # after warmup the lr factor follows a cosine decay from 1 -> end_factor
+ return ((1 + math.cos(current_step * math.pi / cosine_steps)) / 2) * (1 - end_factor) + end_factor
+
+ return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
+
+
+def get_params_groups(model: torch.nn.Module, weight_decay: float = 1e-4):
+ params_group = [{"params": [], "weight_decay": 0.}, # no decay
+ {"params": [], "weight_decay": weight_decay}] # with decay
+
+ for name, param in model.named_parameters():
+ if not param.requires_grad:
+ continue # frozen weights
+
+ if len(param.shape) == 1 or name.endswith(".bias"):
+ # bn:(weight,bias) conv2d:(bias) linear:(bias)
+ params_group[0]["params"].append(param) # no decay
+ else:
+ params_group[1]["params"].append(param) # with decay
+
+ return params_group
diff --git a/pytorch_segmentation/u2net/transforms.py b/pytorch_segmentation/u2net/transforms.py
new file mode 100644
index 000000000..230b0fb87
--- /dev/null
+++ b/pytorch_segmentation/u2net/transforms.py
@@ -0,0 +1,79 @@
+import random
+from typing import List, Union
+from torchvision.transforms import functional as F
+from torchvision.transforms import transforms as T
+
+
+class Compose(object):
+ def __init__(self, transforms):
+ self.transforms = transforms
+
+ def __call__(self, image, target=None):
+ for t in self.transforms:
+ image, target = t(image, target)
+
+ return image, target
+
+
+class ToTensor(object):
+ def __call__(self, image, target):
+ image = F.to_tensor(image)
+ target = F.to_tensor(target)
+ return image, target
+
+
+class RandomHorizontalFlip(object):
+ def __init__(self, prob):
+ self.flip_prob = prob
+
+ def __call__(self, image, target):
+ if random.random() < self.flip_prob:
+ image = F.hflip(image)
+ target = F.hflip(target)
+ return image, target
+
+
+class Normalize(object):
+ def __init__(self, mean, std):
+ self.mean = mean
+ self.std = std
+
+ def __call__(self, image, target):
+ image = F.normalize(image, mean=self.mean, std=self.std)
+ return image, target
+
+
+class Resize(object):
+ def __init__(self, size: Union[int, List[int]], resize_mask: bool = True):
+ self.size = size # [h, w]
+ self.resize_mask = resize_mask
+
+ def __call__(self, image, target=None):
+ image = F.resize(image, self.size)
+ if self.resize_mask is True:
+ target = F.resize(target, self.size)
+
+ return image, target
+
+
+class RandomCrop(object):
+ def __init__(self, size: int):
+ self.size = size
+
+ def pad_if_smaller(self, img, fill=0):
+        # if the shorter image side is smaller than the given size, pad it with `fill`
+        min_size = min(img.shape[-2:])
+        if min_size < self.size:
+            oh, ow = img.shape[-2:]  # images are tensors here (ToTensor runs first), so use shape, not PIL's size
+            padh = self.size - oh if oh < self.size else 0
+            padw = self.size - ow if ow < self.size else 0
+            img = F.pad(img, [0, 0, padw, padh], fill=fill)
+ return img
+
+ def __call__(self, image, target):
+ image = self.pad_if_smaller(image)
+ target = self.pad_if_smaller(target)
+ crop_params = T.RandomCrop.get_params(image, (self.size, self.size))
+ image = F.crop(image, *crop_params)
+ target = F.crop(target, *crop_params)
+ return image, target
diff --git a/pytorch_segmentation/u2net/u2net.png b/pytorch_segmentation/u2net/u2net.png
new file mode 100644
index 000000000..61b4cba34
Binary files /dev/null and b/pytorch_segmentation/u2net/u2net.png differ
diff --git a/pytorch_segmentation/u2net/validation.py b/pytorch_segmentation/u2net/validation.py
new file mode 100644
index 000000000..0c1b4e224
--- /dev/null
+++ b/pytorch_segmentation/u2net/validation.py
@@ -0,0 +1,67 @@
+import os
+from typing import Union, List
+
+import torch
+from torch.utils import data
+
+from src import u2net_full
+from train_utils import evaluate
+from my_dataset import DUTSDataset
+import transforms as T
+
+
+class SODPresetEval:
+ def __init__(self, base_size: Union[int, List[int]], mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
+ self.transforms = T.Compose([
+ T.ToTensor(),
+ T.Resize(base_size, resize_mask=False),
+ T.Normalize(mean=mean, std=std),
+ ])
+
+ def __call__(self, img, target):
+ return self.transforms(img, target)
+
+
+def main(args):
+ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+ assert os.path.exists(args.weights), f"weights {args.weights} not found."
+
+ val_dataset = DUTSDataset(args.data_path, train=False, transforms=SODPresetEval([320, 320]))
+
+ num_workers = 4
+ val_data_loader = data.DataLoader(val_dataset,
+ batch_size=1, # must be 1
+ num_workers=num_workers,
+ pin_memory=True,
+ shuffle=False,
+ collate_fn=val_dataset.collate_fn)
+
+ model = u2net_full()
+ pretrain_weights = torch.load(args.weights, map_location='cpu')
+ if "model" in pretrain_weights:
+ model.load_state_dict(pretrain_weights["model"])
+ else:
+ model.load_state_dict(pretrain_weights)
+ model.to(device)
+
+ mae_metric, f1_metric = evaluate(model, val_data_loader, device=device)
+ print(mae_metric, f1_metric)
+
+
+def parse_args():
+ import argparse
+ parser = argparse.ArgumentParser(description="pytorch u2net validation")
+
+ parser.add_argument("--data-path", default="./", help="DUTS root")
+ parser.add_argument("--weights", default="./u2net_full.pth")
+ parser.add_argument("--device", default="cuda:0", help="training device")
+ parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
+
+ args = parser.parse_args()
+
+ return args
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ main(args)
diff --git a/pytorch_segmentation/unet/README.md b/pytorch_segmentation/unet/README.md
index 115bce4a2..8576783df 100644
--- a/pytorch_segmentation/unet/README.md
+++ b/pytorch_segmentation/unet/README.md
@@ -47,7 +47,7 @@
 ## For more details on this project and an analysis of the U-Net code, see my bilibili
-
+* [https://b23.tv/PCJJmqN](https://b23.tv/PCJJmqN)
 ## This U-Net uses bilinear interpolation for upsampling by default; the structure is shown below
-
\ No newline at end of file
+
diff --git a/pytorch_segmentation/unet/my_dataset.py b/pytorch_segmentation/unet/my_dataset.py
index d11e1217f..969859d4f 100644
--- a/pytorch_segmentation/unet/my_dataset.py
+++ b/pytorch_segmentation/unet/my_dataset.py
@@ -7,9 +7,9 @@
class DriveDataset(Dataset):
def __init__(self, root: str, train: bool, transforms=None):
super(DriveDataset, self).__init__()
- data_root = os.path.join(root, "DRIVE", "training" if train else "test")
- assert os.path.exists(data_root), f"path '{data_root}' does not exists."
self.flag = "training" if train else "test"
+ data_root = os.path.join(root, "DRIVE", self.flag)
+        assert os.path.exists(data_root), f"path '{data_root}' does not exist."
self.transforms = transforms
img_names = [i for i in os.listdir(os.path.join(data_root, "images")) if i.endswith(".tif")]
self.img_list = [os.path.join(data_root, "images", i) for i in img_names]
@@ -18,14 +18,14 @@ def __init__(self, root: str, train: bool, transforms=None):
# check files
for i in self.manual:
if os.path.exists(i) is False:
- print(f"file {i} does not exists.")
+                raise FileNotFoundError(f"file {i} does not exist.")
self.roi_mask = [os.path.join(data_root, "mask", i.split("_")[0] + f"_{self.flag}_mask.gif")
for i in img_names]
# check files
for i in self.roi_mask:
if os.path.exists(i) is False:
- print(f"file {i} does not exists.")
+                raise FileNotFoundError(f"file {i} does not exist.")
def __getitem__(self, idx):
img = Image.open(self.img_list[idx]).convert('RGB')
diff --git a/pytorch_segmentation/unet/predict.py b/pytorch_segmentation/unet/predict.py
index 2e1e1b9a9..c7d557fa7 100644
--- a/pytorch_segmentation/unet/predict.py
+++ b/pytorch_segmentation/unet/predict.py
@@ -61,7 +61,7 @@ def main():
t_start = time_synchronized()
output = model(img.to(device))
t_end = time_synchronized()
- print("inference+NMS time: {}".format(t_end - t_start))
+ print("inference time: {}".format(t_end - t_start))
prediction = output['out'].argmax(1).squeeze(0)
prediction = prediction.to("cpu").numpy().astype(np.uint8)
diff --git a/pytorch_segmentation/unet/requirements.txt b/pytorch_segmentation/unet/requirements.txt
index 50b913cfc..2c58f889e 100644
--- a/pytorch_segmentation/unet/requirements.txt
+++ b/pytorch_segmentation/unet/requirements.txt
@@ -1,4 +1,4 @@
-numpy==1.21.3
-torch==1.10.0
+numpy==1.22.0
+torch==1.13.1
torchvision==0.11.1
-Pillow==8.4.0
\ No newline at end of file
+Pillow
diff --git a/pytorch_segmentation/unet/src/mobilenet_unet.py b/pytorch_segmentation/unet/src/mobilenet_unet.py
index 859e847ba..aff981864 100644
--- a/pytorch_segmentation/unet/src/mobilenet_unet.py
+++ b/pytorch_segmentation/unet/src/mobilenet_unet.py
@@ -88,7 +88,7 @@ def __init__(self, num_classes, pretrain_backbone: bool = False):
self.up4 = Up(c, self.stage_out_channels[0])
self.conv = OutConv(self.stage_out_channels[0], num_classes=num_classes)
- def forward(self, x):
+ def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
input_shape = x.shape[-2:]
backbone_out = self.backbone(x)
x = self.up1(backbone_out['stage4'], backbone_out['stage3'])
diff --git a/pytorch_segmentation/unet/src/unet.py b/pytorch_segmentation/unet/src/unet.py
index 0b50af243..31717aea8 100644
--- a/pytorch_segmentation/unet/src/unet.py
+++ b/pytorch_segmentation/unet/src/unet.py
@@ -1,3 +1,4 @@
+from typing import Dict
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -35,7 +36,7 @@ def __init__(self, in_channels, out_channels, bilinear=True):
self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
self.conv = DoubleConv(in_channels, out_channels)
- def forward(self, x1, x2):
+ def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
x1 = self.up(x1)
# [N, C, H, W]
diff_y = x2.size()[2] - x1.size()[2]
@@ -80,7 +81,7 @@ def __init__(self,
self.up4 = Up(base_c * 2, base_c, bilinear)
self.out_conv = OutConv(base_c, num_classes)
- def forward(self, x):
+ def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
x1 = self.in_conv(x)
x2 = self.down1(x1)
x3 = self.down2(x2)
diff --git a/pytorch_segmentation/unet/src/vgg_unet.py b/pytorch_segmentation/unet/src/vgg_unet.py
index 830af075a..44a21e911 100644
--- a/pytorch_segmentation/unet/src/vgg_unet.py
+++ b/pytorch_segmentation/unet/src/vgg_unet.py
@@ -88,7 +88,7 @@ def __init__(self, num_classes, pretrain_backbone: bool = False):
self.up4 = Up(c, self.stage_out_channels[0])
self.conv = OutConv(self.stage_out_channels[0], num_classes=num_classes)
- def forward(self, x):
+ def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
backbone_out = self.backbone(x)
x = self.up1(backbone_out['stage4'], backbone_out['stage3'])
x = self.up2(x, backbone_out['stage2'])
diff --git a/pytorch_segmentation/unet/train.py b/pytorch_segmentation/unet/train.py
index 21ccade3b..2ac065016 100644
--- a/pytorch_segmentation/unet/train.py
+++ b/pytorch_segmentation/unet/train.py
@@ -169,7 +169,7 @@ def parse_args():
parser.add_argument("--num-classes", default=1, type=int)
parser.add_argument("--device", default="cuda", help="training device")
parser.add_argument("-b", "--batch-size", default=4, type=int)
- parser.add_argument("--epochs", default=100, type=int, metavar="N",
+ parser.add_argument("--epochs", default=200, type=int, metavar="N",
help="number of total epochs to train")
parser.add_argument('--lr', default=0.01, type=float, help='initial learning rate')
diff --git a/pytorch_segmentation/unet/train_multi_GPU.py b/pytorch_segmentation/unet/train_multi_GPU.py
index 8a7007609..11b76ace3 100644
--- a/pytorch_segmentation/unet/train_multi_GPU.py
+++ b/pytorch_segmentation/unet/train_multi_GPU.py
@@ -217,7 +217,7 @@ def main(args):
# specify the epoch to resume training from
parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
# total number of training epochs
- parser.add_argument('--epochs', default=100, type=int, metavar='N',
+ parser.add_argument('--epochs', default=200, type=int, metavar='N',
help='number of total epochs to run')
# whether to use SyncBatchNorm (synchronized across GPUs); off by default, since enabling it slows training down
parser.add_argument('--sync_bn', type=bool, default=False, help='whether using SyncBatchNorm')
diff --git a/pytorch_segmentation/unet/train_utils/distributed_utils.py b/pytorch_segmentation/unet/train_utils/distributed_utils.py
index 6f044f511..577d5ea3b 100644
--- a/pytorch_segmentation/unet/train_utils/distributed_utils.py
+++ b/pytorch_segmentation/unet/train_utils/distributed_utils.py
@@ -130,11 +130,13 @@ def __init__(self, num_classes: int = 2, ignore_index: int = -100):
self.cumulative_dice = None
self.num_classes = num_classes
self.ignore_index = ignore_index
- self.count = 0
+ self.count = None
def update(self, pred, target):
if self.cumulative_dice is None:
self.cumulative_dice = torch.zeros(1, dtype=pred.dtype, device=pred.device)
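+ # keep count as a tensor so it can be all_reduced across processes together with cumulative_dice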
+ if self.count is None:
+ self.count = torch.zeros(1, dtype=pred.dtype, device=pred.device)
# compute the Dice score, ignoring background
pred = F.one_hot(pred.argmax(dim=1), self.num_classes).permute(0, 3, 1, 2).float()
dice_target = build_target(target, self.num_classes, self.ignore_index)
@@ -149,10 +151,12 @@ def value(self):
return self.cumulative_dice / self.count
def reset(self):
- self.count = 0
if self.cumulative_dice is not None:
self.cumulative_dice.zero_()
+ if self.count is not None:
+ self.count.zero_()
+
def reduce_from_all_processes(self):
if not torch.distributed.is_available():
return
@@ -160,6 +164,7 @@ def reduce_from_all_processes(self):
return
torch.distributed.barrier()
torch.distributed.all_reduce(self.cumulative_dice)
+ torch.distributed.all_reduce(self.count)
class MetricLogger(object):
diff --git a/pytorch_segmentation/unet/unet.png b/pytorch_segmentation/unet/unet.png
index a9d874fa9..2107e8bc3 100644
Binary files a/pytorch_segmentation/unet/unet.png and b/pytorch_segmentation/unet/unet.png differ
diff --git a/tensorflow_classification/ConvNeXt/model.py b/tensorflow_classification/ConvNeXt/model.py
new file mode 100644
index 000000000..f1893eb72
--- /dev/null
+++ b/tensorflow_classification/ConvNeXt/model.py
@@ -0,0 +1,214 @@
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import layers, initializers, Model
+
+KERNEL_INITIALIZER = {
+ "class_name": "TruncatedNormal",
+ "config": {
+ "stddev": 0.2
+ }
+}
+
+BIAS_INITIALIZER = "Zeros"
+
+
+class Block(layers.Layer):
+ """
+ Args:
+ dim (int): Number of input channels.
+ drop_rate (float): Stochastic depth rate. Default: 0.0
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ """
+ def __init__(self, dim, drop_rate=0., layer_scale_init_value=1e-6, name: str = None):
+ super().__init__(name=name)
+ self.layer_scale_init_value = layer_scale_init_value
+ self.dwconv = layers.DepthwiseConv2D(7,
+ padding="same",
+ depthwise_initializer=KERNEL_INITIALIZER,
+ bias_initializer=BIAS_INITIALIZER,
+ name="dwconv")
+ self.norm = layers.LayerNormalization(epsilon=1e-6, name="norm")
+ self.pwconv1 = layers.Dense(4 * dim,
+ kernel_initializer=KERNEL_INITIALIZER,
+ bias_initializer=BIAS_INITIALIZER,
+ name="pwconv1")
+ self.act = layers.Activation("gelu")
+ self.pwconv2 = layers.Dense(dim,
+ kernel_initializer=KERNEL_INITIALIZER,
+ bias_initializer=BIAS_INITIALIZER,
+ name="pwconv2")
+ self.drop_path = layers.Dropout(drop_rate, noise_shape=(None, 1, 1, 1)) if drop_rate > 0 else None
+
+ def build(self, input_shape):
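+ # gamma is the Layer Scale parameter: a learnable per-channel scale on the residual branch, initialized to a small constant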
+ if self.layer_scale_init_value > 0:
+ self.gamma = self.add_weight(shape=[input_shape[-1]],
+ initializer=initializers.Constant(self.layer_scale_init_value),
+ trainable=True,
+ dtype=tf.float32,
+ name="gamma")
+ else:
+ self.gamma = None
+
+ def call(self, x, training=False):
+ shortcut = x
+ x = self.dwconv(x)
+ x = self.norm(x, training=training)
+ x = self.pwconv1(x)
+ x = self.act(x)
+ x = self.pwconv2(x)
+
+ if self.gamma is not None:
+ x = self.gamma * x
+
+ if self.drop_path is not None:
+ x = self.drop_path(x, training=training)
+
+ return shortcut + x
+
+
+class Stem(layers.Layer):
+ def __init__(self, dim, name: str = None):
+ super().__init__(name=name)
+ self.conv = layers.Conv2D(dim,
+ kernel_size=4,
+ strides=4,
+ padding="same",
+ kernel_initializer=KERNEL_INITIALIZER,
+ bias_initializer=BIAS_INITIALIZER,
+ name="conv2d")
+ self.norm = layers.LayerNormalization(epsilon=1e-6, name="norm")
+
+ def call(self, x, training=False):
+ x = self.conv(x)
+ x = self.norm(x, training=training)
+ return x
+
+
+class DownSample(layers.Layer):
+ def __init__(self, dim, name: str = None):
+ super().__init__(name=name)
+ self.norm = layers.LayerNormalization(epsilon=1e-6, name="norm")
+ self.conv = layers.Conv2D(dim,
+ kernel_size=2,
+ strides=2,
+ padding="same",
+ kernel_initializer=KERNEL_INITIALIZER,
+ bias_initializer=BIAS_INITIALIZER,
+ name="conv2d")
+
+ def call(self, x, training=False):
+ x = self.norm(x, training=training)
+ x = self.conv(x)
+ return x
+
+
+class ConvNeXt(Model):
+ r""" ConvNeXt
+ A TensorFlow implementation of `A ConvNet for the 2020s` -
+ https://arxiv.org/pdf/2201.03545.pdf
+ Args:
+ num_classes (int): Number of classes for classification head. Default: 1000
+ depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
+ dims (tuple(int)): Feature dimension at each stage. Default: [96, 192, 384, 768]
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ """
+ def __init__(self, num_classes: int, depths: list, dims: list, drop_path_rate: float = 0.,
+ layer_scale_init_value: float = 1e-6):
+ super().__init__()
+ self.stem = Stem(dims[0], name="stem")
+
+ cur = 0
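+ # stochastic depth rate grows linearly from 0 to drop_path_rate over all blocks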
+ dp_rates = np.linspace(start=0, stop=drop_path_rate, num=sum(depths))
+ self.stage1 = [Block(dim=dims[0],
+ drop_rate=dp_rates[cur + i],
+ layer_scale_init_value=layer_scale_init_value,
+ name=f"stage1_block{i}")
+ for i in range(depths[0])]
+ cur += depths[0]
+
+ self.downsample2 = DownSample(dims[1], name="downsample2")
+ self.stage2 = [Block(dim=dims[1],
+ drop_rate=dp_rates[cur + i],
+ layer_scale_init_value=layer_scale_init_value,
+ name=f"stage2_block{i}")
+ for i in range(depths[1])]
+ cur += depths[1]
+
+ self.downsample3 = DownSample(dims[2], name="downsample3")
+ self.stage3 = [Block(dim=dims[2],
+ drop_rate=dp_rates[cur + i],
+ layer_scale_init_value=layer_scale_init_value,
+ name=f"stage3_block{i}")
+ for i in range(depths[2])]
+ cur += depths[2]
+
+ self.downsample4 = DownSample(dims[3], name="downsample4")
+ self.stage4 = [Block(dim=dims[3],
+ drop_rate=dp_rates[cur + i],
+ layer_scale_init_value=layer_scale_init_value,
+ name=f"stage4_block{i}")
+ for i in range(depths[3])]
+
+ self.norm = layers.LayerNormalization(epsilon=1e-6, name="norm")
+ self.head = layers.Dense(units=num_classes,
+ kernel_initializer=KERNEL_INITIALIZER,
+ bias_initializer=BIAS_INITIALIZER,
+ name="head")
+
+ def call(self, x, training=False):
+ x = self.stem(x, training=training)
+ for block in self.stage1:
+ x = block(x, training=training)
+
+ x = self.downsample2(x, training=training)
+ for block in self.stage2:
+ x = block(x, training=training)
+
+ x = self.downsample3(x, training=training)
+ for block in self.stage3:
+ x = block(x, training=training)
+
+ x = self.downsample4(x, training=training)
+ for block in self.stage4:
+ x = block(x, training=training)
+
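+ # global average pooling over the spatial dimensions, followed by the final norm and the classification head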
+ x = tf.reduce_mean(x, axis=[1, 2])
+ x = self.norm(x, training=training)
+ x = self.head(x)
+ return x
+
+
+def convnext_tiny(num_classes: int):
+ model = ConvNeXt(depths=[3, 3, 9, 3],
+ dims=[96, 192, 384, 768],
+ num_classes=num_classes)
+ return model
+
+
+def convnext_small(num_classes: int):
+ model = ConvNeXt(depths=[3, 3, 27, 3],
+ dims=[96, 192, 384, 768],
+ num_classes=num_classes)
+ return model
+
+
+def convnext_base(num_classes: int):
+ model = ConvNeXt(depths=[3, 3, 27, 3],
+ dims=[128, 256, 512, 1024],
+ num_classes=num_classes)
+ return model
+
+
+def convnext_large(num_classes: int):
+ model = ConvNeXt(depths=[3, 3, 27, 3],
+ dims=[192, 384, 768, 1536],
+ num_classes=num_classes)
+ return model
+
+
+def convnext_xlarge(num_classes: int):
+ model = ConvNeXt(depths=[3, 3, 27, 3],
+ dims=[256, 512, 1024, 2048],
+ num_classes=num_classes)
+ return model
diff --git a/tensorflow_classification/ConvNeXt/predict.py b/tensorflow_classification/ConvNeXt/predict.py
new file mode 100644
index 000000000..269f509fd
--- /dev/null
+++ b/tensorflow_classification/ConvNeXt/predict.py
@@ -0,0 +1,63 @@
+import os
+import json
+import glob
+import numpy as np
+
+from PIL import Image
+import tensorflow as tf
+import matplotlib.pyplot as plt
+
+from model import convnext_tiny as create_model
+
+
+def main():
+ num_classes = 5
+ im_height = im_width = 224
+
+ # load image
+ img_path = "../tulip.jpg"
+ assert os.path.exists(img_path), "file: '{}' does not exist.".format(img_path)
+ img = Image.open(img_path)
+ # resize image
+ img = img.resize((im_width, im_height))
+ plt.imshow(img)
+
+ # read image
+ img = np.array(img).astype(np.float32)
+
+ # preprocess
+ img = (img / 255. - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
+
+ # Add the image to a batch where it's the only member.
+ img = (np.expand_dims(img, 0))
+
+ # read class_indict
+ json_path = './class_indices.json'
+ assert os.path.exists(json_path), "file: '{}' does not exist.".format(json_path)
+
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
+
+ # create model
+ model = create_model(num_classes=num_classes)
+ model.build([1, 224, 224, 3])
+
+ weights_path = './save_weights/model.ckpt'
+ assert len(glob.glob(weights_path+"*")), "cannot find {}".format(weights_path)
+ model.load_weights(weights_path)
+
+ result = np.squeeze(model.predict(img, batch_size=1))
+ result = tf.keras.layers.Softmax()(result)
+ predict_class = np.argmax(result)
+
+ print_res = "class: {} prob: {:.3}".format(class_indict[str(predict_class)],
+ result[predict_class])
+ plt.title(print_res)
+ for i in range(len(result)):
+ print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
+ result[i]))
+ plt.show()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tensorflow_classification/ConvNeXt/train.py b/tensorflow_classification/ConvNeXt/train.py
new file mode 100644
index 000000000..b2cf77248
--- /dev/null
+++ b/tensorflow_classification/ConvNeXt/train.py
@@ -0,0 +1,150 @@
+import os
+import re
+import sys
+import datetime
+
+import tensorflow as tf
+from tqdm import tqdm
+
+from model import convnext_tiny as create_model
+from utils import generate_ds, cosine_scheduler
+
+assert tf.version.VERSION >= "2.4.0", "version of tf must be greater than or equal to 2.4.0"
+
+
+def main():
+ data_root = "/data/flower_photos" # get data root path
+
+ if not os.path.exists("./save_weights"):
+ os.makedirs("./save_weights")
+
+ batch_size = 8
+ epochs = 10
+ num_classes = 5
+ freeze_layers = False
+ initial_lr = 0.005
+ weight_decay = 5e-4
+
+ log_dir = "./logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+ train_writer = tf.summary.create_file_writer(os.path.join(log_dir, "train"))
+ val_writer = tf.summary.create_file_writer(os.path.join(log_dir, "val"))
+
+ # data generator with data augmentation
+ train_ds, val_ds = generate_ds(data_root, batch_size=batch_size, val_rate=0.2)
+
+ # create model
+ model = create_model(num_classes=num_classes)
+ model.build((1, 224, 224, 3))
+
+ # download the pretrained weights that I converted in advance
+ # link: https://pan.baidu.com/s/1MtYJ3FCAkiPwaMRKuyZN1Q password: 1cgp
+ # load weights
+ pre_weights_path = './convnext_tiny_1k_224.h5'
+ assert os.path.exists(pre_weights_path), "cannot find {}".format(pre_weights_path)
+ model.load_weights(pre_weights_path, by_name=True, skip_mismatch=True)
+
+ # freeze bottom layers
+ if freeze_layers:
+ for layer in model.layers:
+ if "head" not in layer.name:
+ layer.trainable = False
+ else:
+ print("training {}".format(layer.name))
+
+ model.summary()
+
+ # custom learning rate scheduler
+ scheduler = cosine_scheduler(initial_lr, epochs, len(train_ds), train_writer=train_writer)
+
+ # using keras low level api for training
+ loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+ optimizer = tf.keras.optimizers.SGD(learning_rate=initial_lr, momentum=0.9)
+
+ train_loss = tf.keras.metrics.Mean(name='train_loss')
+ train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
+
+ val_loss = tf.keras.metrics.Mean(name='val_loss')
+ val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')
+
+ @tf.function
+ def train_step(train_images, train_labels):
+ with tf.GradientTape() as tape:
+ output = model(train_images, training=True)
+ ce_loss = loss_object(train_labels, output)
+
+ # l2 loss
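+ # only kernel weights are decayed; bias and LayerNorm gamma/beta are excluded by the regex below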
+ matcher = re.compile(".*(bias|gamma|beta).*")
+ l2loss = weight_decay * tf.add_n([
+ tf.nn.l2_loss(v)
+ for v in model.trainable_variables
+ if not matcher.match(v.name)
+ ])
+
+ loss = ce_loss + l2loss
+
+ gradients = tape.gradient(loss, model.trainable_variables)
+ optimizer.apply_gradients(zip(gradients, model.trainable_variables))
+ train_loss(ce_loss)
+ train_accuracy(train_labels, output)
+
+ @tf.function
+ def val_step(val_images, val_labels):
+ output = model(val_images, training=False)
+ loss = loss_object(val_labels, output)
+
+ val_loss(loss)
+ val_accuracy(val_labels, output)
+
+ best_val_acc = 0.
+ for epoch in range(epochs):
+ train_loss.reset_states() # clear history info
+ train_accuracy.reset_states() # clear history info
+ val_loss.reset_states() # clear history info
+ val_accuracy.reset_states() # clear history info
+
+ # train
+ train_bar = tqdm(train_ds, file=sys.stdout)
+ for images, labels in train_bar:
+ # update learning rate
+ optimizer.learning_rate = next(scheduler)
+
+ train_step(images, labels)
+
+ # print train process
+ train_bar.desc = "train epoch[{}/{}] loss:{:.3f}, acc:{:.3f}, lr:{:.5f}".format(
+ epoch + 1,
+ epochs,
+ train_loss.result(),
+ train_accuracy.result(),
+ optimizer.learning_rate.numpy()
+ )
+
+ # validate
+ val_bar = tqdm(val_ds, file=sys.stdout)
+ for images, labels in val_bar:
+ val_step(images, labels)
+
+ # print val process
+ val_bar.desc = "valid epoch[{}/{}] loss:{:.3f}, acc:{:.3f}".format(epoch + 1,
+ epochs,
+ val_loss.result(),
+ val_accuracy.result())
+ # writing training loss and acc
+ with train_writer.as_default():
+ tf.summary.scalar("loss", train_loss.result(), epoch)
+ tf.summary.scalar("accuracy", train_accuracy.result(), epoch)
+
+ # writing validation loss and acc
+ with val_writer.as_default():
+ tf.summary.scalar("loss", val_loss.result(), epoch)
+ tf.summary.scalar("accuracy", val_accuracy.result(), epoch)
+
+ # only save best weights
+ if val_accuracy.result() > best_val_acc:
+ best_val_acc = val_accuracy.result()
+ save_name = "./save_weights/model.ckpt"
+ model.save_weights(save_name, save_format="tf")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tensorflow_classification/ConvNeXt/trans_weights.py b/tensorflow_classification/ConvNeXt/trans_weights.py
new file mode 100644
index 000000000..a35b1cc2c
--- /dev/null
+++ b/tensorflow_classification/ConvNeXt/trans_weights.py
@@ -0,0 +1,149 @@
+import torch
+from model import *
+
+
+def transpose_weights(m_type, w_dict, k, v):
+ if m_type == "conv":
+ if len(v.shape) > 1:
+ # conv weights
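+ # PyTorch layout (out_c, in_c, kH, kW) -> TF layout (kH, kW, in_c, out_c)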
+ v = np.transpose(v.numpy(), (2, 3, 1, 0)).astype(np.float32)
+ w_dict[k] = v
+ elif m_type == "dwconv":
+ if len(v.shape) > 1:
+ # dwconv weights
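+ # PyTorch layout (channels, 1, kH, kW) -> TF depthwise layout (kH, kW, channels, 1)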
+ v = np.transpose(v.numpy(), (2, 3, 0, 1)).astype(np.float32)
+ w_dict[k] = v
+ elif m_type == "linear":
+ if len(v.shape) > 1:
+ v = np.transpose(v.numpy(), (1, 0)).astype(np.float32)
+ w_dict[k] = v
+ elif m_type == "norm":
+ w_dict[k] = v
+ else:
+ ValueError(f"not support type:{m_type}")
+
+
+def main(weights_path: str,
+ model_name: str,
+ model: tf.keras.Model):
+ var_dict = {v.name.split(':')[0]: v for v in model.weights}
+
+ weights_dict = torch.load(weights_path, map_location="cpu")["model"]
+ w_dict = {}
+ for k, v in weights_dict.items():
+ if "downsample_layers" in k:
+ split_k = k.split(".")
+ if split_k[1] == "0":
+ if split_k[2] == "0":
+ k = "stem/conv2d/" + split_k[-1]
+ k = k.replace("weight", "kernel")
+ transpose_weights("conv", w_dict, k, v)
+ else:
+ k = "stem/norm/" + split_k[-1]
+ k = k.replace("weight", "gamma")
+ k = k.replace("bias", "beta")
+ transpose_weights("norm", w_dict, k, v)
+ else:
+ stage = int(split_k[1]) + 1
+ if split_k[2] == "1":
+ k = f"downsample{stage}/conv2d/" + split_k[-1]
+ k = k.replace("weight", "kernel")
+ transpose_weights("conv", w_dict, k, v)
+ else:
+ k = f"downsample{stage}/norm/" + split_k[-1]
+ k = k.replace("weight", "gamma")
+ k = k.replace("bias", "beta")
+ transpose_weights("norm", w_dict, k, v)
+ elif "stages" in k:
+ split_k = k.split(".")
+ stage = int(split_k[1]) + 1
+ block = int(split_k[2])
+ if "dwconv" in k:
+ k = f"stage{stage}_block{block}/{split_k[-2]}/{split_k[-1]}"
+ k = k.replace("weight", "depthwise_kernel")
+ transpose_weights("dwconv", w_dict, k, v)
+ elif "pwconv" in k:
+ k = f"stage{stage}_block{block}/{split_k[-2]}/{split_k[-1]}"
+ k = k.replace("weight", "kernel")
+ transpose_weights("linear", w_dict, k, v)
+ elif "norm" in k:
+ k = f"stage{stage}_block{block}/{split_k[-2]}/{split_k[-1]}"
+ k = k.replace("weight", "gamma")
+ k = k.replace("bias", "beta")
+ transpose_weights("norm", w_dict, k, v)
+ elif "gamma" in k:
+ k = f"stage{stage}_block{block}/{split_k[-1]}"
+ transpose_weights("norm", w_dict, k, v)
+ else:
+ ValueError(f"unrecognized {k}")
+ elif "norm" in k:
+ split_k = k.split(".")
+ k = f"norm/{split_k[-1]}"
+ k = k.replace("weight", "gamma")
+ k = k.replace("bias", "beta")
+ transpose_weights("norm", w_dict, k, v)
+ elif "head" in k:
+ split_k = k.split(".")
+ k = f"head/{split_k[-1]}"
+ k = k.replace("weight", "kernel")
+ transpose_weights("linear", w_dict, k, v)
+ else:
+ ValueError(f"unrecognized {k}")
+
+ for key, var in var_dict.items():
+ if key in w_dict:
+ if w_dict[key].shape != var.shape:
+ msg = "shape mismatch: {}".format(key)
+ print(msg)
+ else:
+ var.assign(w_dict[key], read_value=False)
+ else:
+ msg = "Not found {} in {}".format(key, weights_path)
+ print(msg)
+
+ model.save_weights("./{}.h5".format(model_name))
+
+
+if __name__ == '__main__':
+ model = convnext_tiny(num_classes=1000)
+ model.build((1, 224, 224, 3))
+ # https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth
+ main(weights_path="./convnext_tiny_1k_224_ema.pth",
+ model_name="convnext_tiny_1k_224",
+ model=model)
+
+ # model = convnext_small(num_classes=1000)
+ # model.build((1, 224, 224, 3))
+ # # https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth
+ # main(weights_path="./convnext_small_1k_224_ema.pth",
+ # model_name="convnext_small_1k_224",
+ # model=model)
+
+ # model = convnext_base(num_classes=1000)
+ # model.build((1, 224, 224, 3))
+ # # https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth
+ # main(weights_path="./convnext_base_1k_224_ema.pth",
+ # model_name="convnext_base_1k_224",
+ # model=model)
+
+ # model = convnext_base(num_classes=21841)
+ # model.build((1, 224, 224, 3))
+ # # https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth
+ # main(weights_path="./convnext_base_22k_224.pth",
+ # model_name="convnext_base_22k_224",
+ # model=model)
+
+ # model = convnext_large(num_classes=1000)
+ # model.build((1, 224, 224, 3))
+ # # https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth
+ # main(weights_path="./convnext_large_1k_224_ema.pth",
+ # model_name="convnext_large_1k_224",
+ # model=model)
+
+ # model = convnext_large(num_classes=21841)
+ # model.build((1, 224, 224, 3))
+ # # https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth
+ # main(weights_path="./convnext_large_22k_224.pth",
+ # model_name="convnext_large_22k_224",
+ # model=model)
+
diff --git a/tensorflow_classification/ConvNeXt/utils.py b/tensorflow_classification/ConvNeXt/utils.py
new file mode 100644
index 000000000..57470b045
--- /dev/null
+++ b/tensorflow_classification/ConvNeXt/utils.py
@@ -0,0 +1,174 @@
+import os
+import json
+import random
+import math
+
+import numpy as np
+import tensorflow as tf
+import matplotlib.pyplot as plt
+
+
+def read_split_data(root: str, val_rate: float = 0.2):
+ random.seed(0) # keep the random split reproducible
+ assert os.path.exists(root), "dataset root: {} does not exist.".format(root)
+
+ # traverse the folders; each folder corresponds to one class
+ flower_class = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]
+ # sort to keep the class order consistent
+ flower_class.sort()
+ # map each class name to a numeric index
+ class_indices = dict((k, v) for v, k in enumerate(flower_class))
+ json_str = json.dumps(dict((val, key) for key, val in class_indices.items()), indent=4)
+ with open('class_indices.json', 'w') as json_file:
+ json_file.write(json_str)
+
+ train_images_path = [] # paths of all training images
+ train_images_label = [] # class index of each training image
+ val_images_path = [] # paths of all validation images
+ val_images_label = [] # class index of each validation image
+ every_class_num = [] # total number of samples per class
+ supported = [".jpg", ".JPG", ".jpeg", ".JPEG"] # supported file extensions
+ # iterate over the files in each class folder
+ for cla in flower_class:
+ cla_path = os.path.join(root, cla)
+ # collect the paths of all files with a supported extension
+ images = [os.path.join(root, cla, i) for i in os.listdir(cla_path)
+ if os.path.splitext(i)[-1] in supported]
+ # index of this class
+ image_class = class_indices[cla]
+ # record the number of samples in this class
+ every_class_num.append(len(images))
+ # randomly sample validation images according to val_rate
+ val_path = random.sample(images, k=int(len(images) * val_rate))
+
+ for img_path in images:
+ if img_path in val_path: # paths sampled above go to the validation set
+ val_images_path.append(img_path)
+ val_images_label.append(image_class)
+ else: # the rest go to the training set
+ train_images_path.append(img_path)
+ train_images_label.append(image_class)
+
+ print("{} images were found in the dataset.\n{} for training, {} for validation".format(sum(every_class_num),
+ len(train_images_path),
+ len(val_images_path)
+ ))
+
+ plot_image = False
+ if plot_image:
+ # bar chart of the number of samples per class
+ plt.bar(range(len(flower_class)), every_class_num, align='center')
+ # replace the x ticks 0,1,2,3,4 with the corresponding class names
+ plt.xticks(range(len(flower_class)), flower_class)
+ # add value labels on top of the bars
+ for i, v in enumerate(every_class_num):
+ plt.text(x=i, y=v + 5, s=str(v), ha='center')
+ # x-axis label
+ plt.xlabel('image class')
+ # y-axis label
+ plt.ylabel('number of images')
+ # chart title
+ plt.title('flower class distribution')
+ plt.show()
+
+ return train_images_path, train_images_label, val_images_path, val_images_label
+
+
+def generate_ds(data_root: str,
+ train_im_height: int = 224,
+ train_im_width: int = 224,
+ val_im_height: int = None,
+ val_im_width: int = None,
+ batch_size: int = 8,
+ val_rate: float = 0.1,
+ cache_data: bool = False):
+ """
+ Read and split the dataset, and build the training and validation dataset iterators.
+ :param data_root: dataset root directory
+ :param train_im_height: height of the images fed to the network during training
+ :param train_im_width: width of the images fed to the network during training
+ :param val_im_height: height of the images fed to the network during validation
+ :param val_im_width: width of the images fed to the network during validation
+ :param batch_size: batch size used for training
+ :param val_rate: fraction of the data assigned to the validation set
+ :param cache_data: whether to cache the data in memory
+ :return:
+ """
+ assert train_im_height is not None
+ assert train_im_width is not None
+ if val_im_width is None:
+ val_im_width = train_im_width
+ if val_im_height is None:
+ val_im_height = train_im_height
+
+ train_img_path, train_img_label, val_img_path, val_img_label = read_split_data(data_root, val_rate=val_rate)
+ AUTOTUNE = tf.data.experimental.AUTOTUNE
+
+ def process_train_info(img_path, label):
+ image = tf.io.read_file(img_path)
+ image = tf.image.decode_jpeg(image, channels=3)
+ image = tf.cast(image, tf.float32)
+ image = tf.image.resize_with_crop_or_pad(image, train_im_height, train_im_width)
+ image = tf.image.random_flip_left_right(image)
+ image = (image / 255. - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
+ return image, label
+
+ def process_val_info(img_path, label):
+ image = tf.io.read_file(img_path)
+ image = tf.image.decode_jpeg(image, channels=3)
+ image = tf.cast(image, tf.float32)
+ image = tf.image.resize_with_crop_or_pad(image, val_im_height, val_im_width)
+ image = (image / 255. - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
+ return image, label
+
+ # Configure dataset for performance
+ def configure_for_performance(ds,
+ shuffle_size: int,
+ shuffle: bool = False,
+ cache: bool = False):
+ if cache:
+ ds = ds.cache() # cache the data in memory after the first read
+ if shuffle:
+ ds = ds.shuffle(buffer_size=shuffle_size) # shuffle the sample order
+ ds = ds.batch(batch_size) # set the batch size
+ ds = ds.prefetch(buffer_size=AUTOTUNE) # prepare the next step's data while the current step trains
+ return ds
+
+ train_ds = tf.data.Dataset.from_tensor_slices((tf.constant(train_img_path),
+ tf.constant(train_img_label)))
+ total_train = len(train_img_path)
+
+ # Use Dataset.map to create a dataset of image, label pairs
+ train_ds = train_ds.map(process_train_info, num_parallel_calls=AUTOTUNE)
+ train_ds = configure_for_performance(train_ds, total_train, shuffle=True, cache=cache_data)
+
+ val_ds = tf.data.Dataset.from_tensor_slices((tf.constant(val_img_path),
+ tf.constant(val_img_label)))
+ total_val = len(val_img_path)
+ # Use Dataset.map to create a dataset of image, label pairs
+ val_ds = val_ds.map(process_val_info, num_parallel_calls=AUTOTUNE)
+ val_ds = configure_for_performance(val_ds, total_val, cache=False)
+
+ return train_ds, val_ds
+
+
+def cosine_rate(now_step, total_step, end_lr_rate):
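+ # decays from 1.0 at step 0 to end_lr_rate at total_step along half a cosine period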
+ rate = ((1 + math.cos(now_step * math.pi / total_step)) / 2) * (1 - end_lr_rate) + end_lr_rate # cosine
+ return rate
+
+
+def cosine_scheduler(initial_lr, epochs, steps, warmup_epochs=1, end_lr_rate=1e-6, train_writer=None):
+ """custom learning rate scheduler"""
+ assert warmup_epochs < epochs
+ warmup = np.linspace(start=1e-8, stop=initial_lr, num=warmup_epochs*steps)
+ remainder_steps = (epochs - warmup_epochs) * steps
+ cosine = initial_lr * np.array([cosine_rate(i, remainder_steps, end_lr_rate) for i in range(remainder_steps)])
+ lr_list = np.concatenate([warmup, cosine])
+
+ for i in range(len(lr_list)):
+ new_lr = lr_list[i]
+ if train_writer is not None:
+ # writing lr into tensorboard
+ with train_writer.as_default():
+ tf.summary.scalar('learning rate', data=new_lr, step=i)
+ yield new_lr
diff --git a/tensorflow_classification/Test11_efficientnetV2/predict.py b/tensorflow_classification/Test11_efficientnetV2/predict.py
index dec912667..27476c45f 100644
--- a/tensorflow_classification/Test11_efficientnetV2/predict.py
+++ b/tensorflow_classification/Test11_efficientnetV2/predict.py
@@ -40,8 +40,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_model(num_classes=num_classes)
@@ -59,7 +59,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/Test2_alexnet/predict.py b/tensorflow_classification/Test2_alexnet/predict.py
index bd4401359..59fd66496 100644
--- a/tensorflow_classification/Test2_alexnet/predict.py
+++ b/tensorflow_classification/Test2_alexnet/predict.py
@@ -31,8 +31,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = AlexNet_v1(num_classes=5)
@@ -49,7 +49,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/Test3_vgg/predict.py b/tensorflow_classification/Test3_vgg/predict.py
index c060f90a3..3cb4f0dcc 100644
--- a/tensorflow_classification/Test3_vgg/predict.py
+++ b/tensorflow_classification/Test3_vgg/predict.py
@@ -31,8 +31,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = vgg("vgg16", im_height=im_height, im_width=im_width, num_classes=num_classes)
@@ -49,7 +49,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/Test4_goolenet/predict.py b/tensorflow_classification/Test4_goolenet/predict.py
index ee9ed521e..a74a07cbc 100644
--- a/tensorflow_classification/Test4_goolenet/predict.py
+++ b/tensorflow_classification/Test4_goolenet/predict.py
@@ -31,8 +31,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
model = GoogLeNet(class_num=5, aux_logits=False)
model.summary()
@@ -49,7 +49,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/Test5_resnet/predict.py b/tensorflow_classification/Test5_resnet/predict.py
index 2939f6362..9cb0df536 100644
--- a/tensorflow_classification/Test5_resnet/predict.py
+++ b/tensorflow_classification/Test5_resnet/predict.py
@@ -37,8 +37,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
feature = resnet50(num_classes=num_classes, include_top=False)
@@ -65,7 +65,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/Test6_mobilenet/predict.py b/tensorflow_classification/Test6_mobilenet/predict.py
index c98619ffb..9ba39cc86 100644
--- a/tensorflow_classification/Test6_mobilenet/predict.py
+++ b/tensorflow_classification/Test6_mobilenet/predict.py
@@ -34,8 +34,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
feature = MobileNetV2(include_top=False)
@@ -56,7 +56,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/Test7_shuffleNet/predict.py b/tensorflow_classification/Test7_shuffleNet/predict.py
index 48a4f6751..4ede6789b 100644
--- a/tensorflow_classification/Test7_shuffleNet/predict.py
+++ b/tensorflow_classification/Test7_shuffleNet/predict.py
@@ -36,8 +36,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = shufflenet_v2_x1_0(num_classes=num_classes)
@@ -54,7 +54,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/Test9_efficientNet/predict.py b/tensorflow_classification/Test9_efficientNet/predict.py
index 3897e5591..632a202b1 100644
--- a/tensorflow_classification/Test9_efficientNet/predict.py
+++ b/tensorflow_classification/Test9_efficientNet/predict.py
@@ -41,8 +41,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_model(num_classes=num_classes)
@@ -59,7 +59,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/swin_transformer/predict.py b/tensorflow_classification/swin_transformer/predict.py
index e5e0ae545..95e3fc892 100644
--- a/tensorflow_classification/swin_transformer/predict.py
+++ b/tensorflow_classification/swin_transformer/predict.py
@@ -35,8 +35,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_model(num_classes=num_classes)
@@ -55,7 +55,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()
diff --git a/tensorflow_classification/vision_transformer/predict.py b/tensorflow_classification/vision_transformer/predict.py
index 49c4c462f..95e803064 100755
--- a/tensorflow_classification/vision_transformer/predict.py
+++ b/tensorflow_classification/vision_transformer/predict.py
@@ -35,8 +35,8 @@ def main():
json_path = './class_indices.json'
assert os.path.exists(json_path), "file: '{}' dose not exist.".format(json_path)
- json_file = open(json_path, "r")
- class_indict = json.load(json_file)
+ with open(json_path, "r") as f:
+ class_indict = json.load(f)
# create model
model = create_model(num_classes=num_classes, has_logits=False)
@@ -55,7 +55,7 @@ def main():
plt.title(print_res)
for i in range(len(result)):
print("class: {:10} prob: {:.3}".format(class_indict[str(i)],
- result[i].numpy()))
+ result[i]))
plt.show()