diff options
Diffstat (limited to 'openvz-sources/026.009/0100_patch-026test009-core.patch')
-rw-r--r-- | openvz-sources/026.009/0100_patch-026test009-core.patch | 76411 |
1 files changed, 0 insertions, 76411 deletions
diff --git a/openvz-sources/026.009/0100_patch-026test009-core.patch b/openvz-sources/026.009/0100_patch-026test009-core.patch deleted file mode 100644 index de10b67..0000000 --- a/openvz-sources/026.009/0100_patch-026test009-core.patch +++ /dev/null @@ -1,76411 +0,0 @@ -diff -upr linux-2.6.16.orig/COPYING.SWsoft linux-2.6.16-026test009/COPYING.SWsoft ---- linux-2.6.16.orig/COPYING.SWsoft 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/COPYING.SWsoft 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,350 @@ -+ -+Nothing in this license should be construed as a grant by SWsoft of any rights -+beyond the rights specified in the GNU General Public License, and nothing in -+this license should be construed as a waiver by SWsoft of its patent, copyright -+and/or trademark rights, beyond the waiver required by the GNU General Public -+License. This license is expressly inapplicable to any product that is not -+within the scope of the GNU General Public License -+ -+---------------------------------------- -+ -+ GNU GENERAL PUBLIC LICENSE -+ Version 2, June 1991 -+ -+ Copyright (C) 1989, 1991 Free Software Foundation, Inc. -+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ Everyone is permitted to copy and distribute verbatim copies -+ of this license document, but changing it is not allowed. -+ -+ Preamble -+ -+ The licenses for most software are designed to take away your -+freedom to share and change it. By contrast, the GNU General Public -+License is intended to guarantee your freedom to share and change free -+software--to make sure the software is free for all its users. This -+General Public License applies to most of the Free Software -+Foundation's software and to any other program whose authors commit to -+using it. (Some other Free Software Foundation software is covered by -+the GNU Library General Public License instead.) You can apply it to -+your programs, too. 
-+ -+ When we speak of free software, we are referring to freedom, not -+price. Our General Public Licenses are designed to make sure that you -+have the freedom to distribute copies of free software (and charge for -+this service if you wish), that you receive source code or can get it -+if you want it, that you can change the software or use pieces of it -+in new free programs; and that you know you can do these things. -+ -+ To protect your rights, we need to make restrictions that forbid -+anyone to deny you these rights or to ask you to surrender the rights. -+These restrictions translate to certain responsibilities for you if you -+distribute copies of the software, or if you modify it. -+ -+ For example, if you distribute copies of such a program, whether -+gratis or for a fee, you must give the recipients all the rights that -+you have. You must make sure that they, too, receive or can get the -+source code. And you must show them these terms so they know their -+rights. -+ -+ We protect your rights with two steps: (1) copyright the software, and -+(2) offer you this license which gives you legal permission to copy, -+distribute and/or modify the software. -+ -+ Also, for each author's protection and ours, we want to make certain -+that everyone understands that there is no warranty for this free -+software. If the software is modified by someone else and passed on, we -+want its recipients to know that what they have is not the original, so -+that any problems introduced by others will not reflect on the original -+authors' reputations. -+ -+ Finally, any free program is threatened constantly by software -+patents. We wish to avoid the danger that redistributors of a free -+program will individually obtain patent licenses, in effect making the -+program proprietary. To prevent this, we have made it clear that any -+patent must be licensed for everyone's free use or not licensed at all. 
-+ -+ The precise terms and conditions for copying, distribution and -+modification follow. -+ -+ GNU GENERAL PUBLIC LICENSE -+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION -+ -+ 0. This License applies to any program or other work which contains -+a notice placed by the copyright holder saying it may be distributed -+under the terms of this General Public License. The "Program", below, -+refers to any such program or work, and a "work based on the Program" -+means either the Program or any derivative work under copyright law: -+that is to say, a work containing the Program or a portion of it, -+either verbatim or with modifications and/or translated into another -+language. (Hereinafter, translation is included without limitation in -+the term "modification".) Each licensee is addressed as "you". -+ -+Activities other than copying, distribution and modification are not -+covered by this License; they are outside its scope. The act of -+running the Program is not restricted, and the output from the Program -+is covered only if its contents constitute a work based on the -+Program (independent of having been made by running the Program). -+Whether that is true depends on what the Program does. -+ -+ 1. You may copy and distribute verbatim copies of the Program's -+source code as you receive it, in any medium, provided that you -+conspicuously and appropriately publish on each copy an appropriate -+copyright notice and disclaimer of warranty; keep intact all the -+notices that refer to this License and to the absence of any warranty; -+and give any other recipients of the Program a copy of this License -+along with the Program. -+ -+You may charge a fee for the physical act of transferring a copy, and -+you may at your option offer warranty protection in exchange for a fee. -+ -+ 2. 
You may modify your copy or copies of the Program or any portion -+of it, thus forming a work based on the Program, and copy and -+distribute such modifications or work under the terms of Section 1 -+above, provided that you also meet all of these conditions: -+ -+ a) You must cause the modified files to carry prominent notices -+ stating that you changed the files and the date of any change. -+ -+ b) You must cause any work that you distribute or publish, that in -+ whole or in part contains or is derived from the Program or any -+ part thereof, to be licensed as a whole at no charge to all third -+ parties under the terms of this License. -+ -+ c) If the modified program normally reads commands interactively -+ when run, you must cause it, when started running for such -+ interactive use in the most ordinary way, to print or display an -+ announcement including an appropriate copyright notice and a -+ notice that there is no warranty (or else, saying that you provide -+ a warranty) and that users may redistribute the program under -+ these conditions, and telling the user how to view a copy of this -+ License. (Exception: if the Program itself is interactive but -+ does not normally print such an announcement, your work based on -+ the Program is not required to print an announcement.) -+ -+These requirements apply to the modified work as a whole. If -+identifiable sections of that work are not derived from the Program, -+and can be reasonably considered independent and separate works in -+themselves, then this License, and its terms, do not apply to those -+sections when you distribute them as separate works. But when you -+distribute the same sections as part of a whole which is a work based -+on the Program, the distribution of the whole must be on the terms of -+this License, whose permissions for other licensees extend to the -+entire whole, and thus to each and every part regardless of who wrote it. 
-+ -+Thus, it is not the intent of this section to claim rights or contest -+your rights to work written entirely by you; rather, the intent is to -+exercise the right to control the distribution of derivative or -+collective works based on the Program. -+ -+In addition, mere aggregation of another work not based on the Program -+with the Program (or with a work based on the Program) on a volume of -+a storage or distribution medium does not bring the other work under -+the scope of this License. -+ -+ 3. You may copy and distribute the Program (or a work based on it, -+under Section 2) in object code or executable form under the terms of -+Sections 1 and 2 above provided that you also do one of the following: -+ -+ a) Accompany it with the complete corresponding machine-readable -+ source code, which must be distributed under the terms of Sections -+ 1 and 2 above on a medium customarily used for software interchange; or, -+ -+ b) Accompany it with a written offer, valid for at least three -+ years, to give any third party, for a charge no more than your -+ cost of physically performing source distribution, a complete -+ machine-readable copy of the corresponding source code, to be -+ distributed under the terms of Sections 1 and 2 above on a medium -+ customarily used for software interchange; or, -+ -+ c) Accompany it with the information you received as to the offer -+ to distribute corresponding source code. (This alternative is -+ allowed only for noncommercial distribution and only if you -+ received the program in object code or executable form with such -+ an offer, in accord with Subsection b above.) -+ -+The source code for a work means the preferred form of the work for -+making modifications to it. For an executable work, complete source -+code means all the source code for all modules it contains, plus any -+associated interface definition files, plus the scripts used to -+control compilation and installation of the executable. 
However, as a -+special exception, the source code distributed need not include -+anything that is normally distributed (in either source or binary -+form) with the major components (compiler, kernel, and so on) of the -+operating system on which the executable runs, unless that component -+itself accompanies the executable. -+ -+If distribution of executable or object code is made by offering -+access to copy from a designated place, then offering equivalent -+access to copy the source code from the same place counts as -+distribution of the source code, even though third parties are not -+compelled to copy the source along with the object code. -+ -+ 4. You may not copy, modify, sublicense, or distribute the Program -+except as expressly provided under this License. Any attempt -+otherwise to copy, modify, sublicense or distribute the Program is -+void, and will automatically terminate your rights under this License. -+However, parties who have received copies, or rights, from you under -+this License will not have their licenses terminated so long as such -+parties remain in full compliance. -+ -+ 5. You are not required to accept this License, since you have not -+signed it. However, nothing else grants you permission to modify or -+distribute the Program or its derivative works. These actions are -+prohibited by law if you do not accept this License. Therefore, by -+modifying or distributing the Program (or any work based on the -+Program), you indicate your acceptance of this License to do so, and -+all its terms and conditions for copying, distributing or modifying -+the Program or works based on it. -+ -+ 6. Each time you redistribute the Program (or any work based on the -+Program), the recipient automatically receives a license from the -+original licensor to copy, distribute or modify the Program subject to -+these terms and conditions. You may not impose any further -+restrictions on the recipients' exercise of the rights granted herein. 
-+You are not responsible for enforcing compliance by third parties to -+this License. -+ -+ 7. If, as a consequence of a court judgment or allegation of patent -+infringement or for any other reason (not limited to patent issues), -+conditions are imposed on you (whether by court order, agreement or -+otherwise) that contradict the conditions of this License, they do not -+excuse you from the conditions of this License. If you cannot -+distribute so as to satisfy simultaneously your obligations under this -+License and any other pertinent obligations, then as a consequence you -+may not distribute the Program at all. For example, if a patent -+license would not permit royalty-free redistribution of the Program by -+all those who receive copies directly or indirectly through you, then -+the only way you could satisfy both it and this License would be to -+refrain entirely from distribution of the Program. -+ -+If any portion of this section is held invalid or unenforceable under -+any particular circumstance, the balance of the section is intended to -+apply and the section as a whole is intended to apply in other -+circumstances. -+ -+It is not the purpose of this section to induce you to infringe any -+patents or other property right claims or to contest validity of any -+such claims; this section has the sole purpose of protecting the -+integrity of the free software distribution system, which is -+implemented by public license practices. Many people have made -+generous contributions to the wide range of software distributed -+through that system in reliance on consistent application of that -+system; it is up to the author/donor to decide if he or she is willing -+to distribute software through any other system and a licensee cannot -+impose that choice. -+ -+This section is intended to make thoroughly clear what is believed to -+be a consequence of the rest of this License. -+ -+ 8. 
If the distribution and/or use of the Program is restricted in -+certain countries either by patents or by copyrighted interfaces, the -+original copyright holder who places the Program under this License -+may add an explicit geographical distribution limitation excluding -+those countries, so that distribution is permitted only in or among -+countries not thus excluded. In such case, this License incorporates -+the limitation as if written in the body of this License. -+ -+ 9. The Free Software Foundation may publish revised and/or new versions -+of the General Public License from time to time. Such new versions will -+be similar in spirit to the present version, but may differ in detail to -+address new problems or concerns. -+ -+Each version is given a distinguishing version number. If the Program -+specifies a version number of this License which applies to it and "any -+later version", you have the option of following the terms and conditions -+either of that version or of any later version published by the Free -+Software Foundation. If the Program does not specify a version number of -+this License, you may choose any version ever published by the Free Software -+Foundation. -+ -+ 10. If you wish to incorporate parts of the Program into other free -+programs whose distribution conditions are different, write to the author -+to ask for permission. For software which is copyrighted by the Free -+Software Foundation, write to the Free Software Foundation; we sometimes -+make exceptions for this. Our decision will be guided by the two goals -+of preserving the free status of all derivatives of our free software and -+of promoting the sharing and reuse of software generally. -+ -+ NO WARRANTY -+ -+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN -+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -+REPAIR OR CORRECTION. -+ -+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -+POSSIBILITY OF SUCH DAMAGES. -+ -+ END OF TERMS AND CONDITIONS -+ -+ How to Apply These Terms to Your New Programs -+ -+ If you develop a new program, and you want it to be of the greatest -+possible use to the public, the best way to achieve this is to make it -+free software which everyone can redistribute and change under these terms. -+ -+ To do so, attach the following notices to the program. It is safest -+to attach them to the start of each source file to most effectively -+convey the exclusion of warranty; and each file should have at least -+the "copyright" line and a pointer to where the full notice is found. 
-+ -+ <one line to give the program's name and a brief idea of what it does.> -+ Copyright (C) <year> <name of author> -+ -+ This program is free software; you can redistribute it and/or modify -+ it under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 2 of the License, or -+ (at your option) any later version. -+ -+ This program is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software -+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ -+ -+Also add information on how to contact you by electronic and paper mail. -+ -+If the program is interactive, make it output a short notice like this -+when it starts in an interactive mode: -+ -+ Gnomovision version 69, Copyright (C) year name of author -+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. -+ This is free software, and you are welcome to redistribute it -+ under certain conditions; type `show c' for details. -+ -+The hypothetical commands `show w' and `show c' should show the appropriate -+parts of the General Public License. Of course, the commands you use may -+be called something other than `show w' and `show c'; they could even be -+mouse-clicks or menu items--whatever suits your program. -+ -+You should also get your employer (if you work as a programmer) or your -+school, if any, to sign a "copyright disclaimer" for the program, if -+necessary. Here is a sample; alter the names: -+ -+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program -+ `Gnomovision' (which makes passes at compilers) written by James Hacker. 
-+ -+ <signature of Ty Coon>, 1 April 1989 -+ Ty Coon, President of Vice -+ -+This General Public License does not permit incorporating your program into -+proprietary programs. If your program is a subroutine library, you may -+consider it more useful to permit linking proprietary applications with the -+library. If this is what you want to do, use the GNU Library General -+Public License instead of this License. -diff -upr linux-2.6.16.orig/Makefile linux-2.6.16-026test009/Makefile ---- linux-2.6.16.orig/Makefile 2006-04-19 15:02:13.000000000 +0400 -+++ linux-2.6.16-026test009/Makefile 2006-04-19 15:02:13.000000000 +0400 -@@ -1,7 +1,7 @@ - VERSION = 2 - PATCHLEVEL = 6 - SUBLEVEL = 16 --EXTRAVERSION = -+EXTRAVERSION = -026test009 - NAME=Sliding Snow Leopard - - # *DOCUMENTATION* -diff -upr linux-2.6.16.orig/arch/alpha/kernel/setup.c linux-2.6.16-026test009/arch/alpha/kernel/setup.c ---- linux-2.6.16.orig/arch/alpha/kernel/setup.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/alpha/kernel/setup.c 2006-04-19 15:02:11.000000000 +0400 -@@ -24,6 +24,7 @@ - #include <linux/config.h> /* CONFIG_ALPHA_LCA etc */ - #include <linux/mc146818rtc.h> - #include <linux/console.h> -+#include <linux/cpu.h> - #include <linux/errno.h> - #include <linux/init.h> - #include <linux/string.h> -@@ -477,6 +478,22 @@ page_is_ram(unsigned long pfn) - #undef PFN_PHYS - #undef PFN_MAX - -+static int __init -+register_cpus(void) -+{ -+ int i; -+ -+ for_each_possible_cpu(i) { -+ struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL); -+ if (!p) -+ return -ENOMEM; -+ register_cpu(p, i, NULL); -+ } -+ return 0; -+} -+ -+arch_initcall(register_cpus); -+ - void __init - setup_arch(char **cmdline_p) - { -diff -upr linux-2.6.16.orig/arch/alpha/kernel/smp.c linux-2.6.16-026test009/arch/alpha/kernel/smp.c ---- linux-2.6.16.orig/arch/alpha/kernel/smp.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/alpha/kernel/smp.c 2006-04-19 15:02:11.000000000 +0400 -@@ -439,7 
+439,7 @@ setup_smp(void) - if ((cpu->flags & 0x1cc) == 0x1cc) { - smp_num_probed++; - /* Assume here that "whami" == index */ -- cpu_set(i, cpu_possible_map); -+ cpu_set(i, cpu_present_mask); - cpu->pal_revision = boot_cpu_palrev; - } - -@@ -450,9 +450,8 @@ setup_smp(void) - } - } else { - smp_num_probed = 1; -- cpu_set(boot_cpuid, cpu_possible_map); -+ cpu_set(boot_cpuid, cpu_present_mask); - } -- cpu_present_mask = cpumask_of_cpu(boot_cpuid); - - printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_mask = %lx\n", - smp_num_probed, cpu_possible_map.bits[0]); -@@ -488,9 +487,8 @@ void __devinit - smp_prepare_boot_cpu(void) - { - /* -- * Mark the boot cpu (current cpu) as both present and online -+ * Mark the boot cpu (current cpu) as online - */ -- cpu_set(smp_processor_id(), cpu_present_mask); - cpu_set(smp_processor_id(), cpu_online_map); - } - -diff -upr linux-2.6.16.orig/arch/arm/kernel/smp.c linux-2.6.16-026test009/arch/arm/kernel/smp.c ---- linux-2.6.16.orig/arch/arm/kernel/smp.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/arm/kernel/smp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -197,7 +197,7 @@ int __cpuexit __cpu_disable(void) - local_flush_tlb_all(); - - read_lock(&tasklist_lock); -- for_each_process(p) { -+ for_each_process_all(p) { - if (p->mm) - cpu_clear(cpu, p->mm->cpu_vm_mask); - } -diff -upr linux-2.6.16.orig/arch/frv/mm/mmu-context.c linux-2.6.16-026test009/arch/frv/mm/mmu-context.c ---- linux-2.6.16.orig/arch/frv/mm/mmu-context.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/frv/mm/mmu-context.c 2006-04-19 15:02:12.000000000 +0400 -@@ -181,7 +181,7 @@ int cxn_pin_by_pid(pid_t pid) - - /* get a handle on the mm_struct */ - read_lock(&tasklist_lock); -- tsk = find_task_by_pid(pid); -+ tsk = find_task_by_pid_ve(pid); - if (tsk) { - ret = -EINVAL; - -diff -upr linux-2.6.16.orig/arch/i386/Kconfig linux-2.6.16-026test009/arch/i386/Kconfig ---- linux-2.6.16.orig/arch/i386/Kconfig 2006-04-19 
15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/Kconfig 2006-04-19 15:02:12.000000000 +0400 -@@ -1071,12 +1071,16 @@ endmenu - - source "arch/i386/Kconfig.debug" - -+source "kernel/Kconfig.openvz" -+ - source "security/Kconfig" - - source "crypto/Kconfig" - - source "lib/Kconfig" - -+source "kernel/ub/Kconfig" -+ - # - # Use the generic interrupt handling code in kernel/irq/: - # -diff -upr linux-2.6.16.orig/arch/i386/kernel/apic.c linux-2.6.16-026test009/arch/i386/kernel/apic.c ---- linux-2.6.16.orig/arch/i386/kernel/apic.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/apic.c 2006-04-19 15:02:12.000000000 +0400 -@@ -1177,6 +1177,7 @@ inline void smp_local_timer_interrupt(st - fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) - { - int cpu = smp_processor_id(); -+ struct ve_struct *ve; - - /* - * the NMI deadlock-detector uses this. -@@ -1193,9 +1194,11 @@ fastcall void smp_apic_timer_interrupt(s - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. 
- */ -+ ve = set_exec_env(get_ve0()); - irq_enter(); - smp_local_timer_interrupt(regs); - irq_exit(); -+ (void)set_exec_env(ve); - } - - #ifndef CONFIG_SMP -diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/Kconfig linux-2.6.16-026test009/arch/i386/kernel/cpu/cpufreq/Kconfig ---- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/Kconfig 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/cpu/cpufreq/Kconfig 2006-04-19 15:02:11.000000000 +0400 -@@ -203,6 +203,7 @@ config X86_LONGRUN - config X86_LONGHAUL - tristate "VIA Cyrix III Longhaul" - select CPU_FREQ_TABLE -+ depends on BROKEN - help - This adds the CPUFreq driver for VIA Samuel/CyrixIII, - VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T -diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c linux-2.6.16-026test009/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c ---- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2006-04-19 15:02:11.000000000 +0400 -@@ -244,7 +244,7 @@ static int cpufreq_p4_cpu_init(struct cp - for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if ((i<2) && (has_N44_O17_errata[policy->cpu])) - p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; -- else if (has_N60_errata[policy->cpu] && p4clockmod_table[i].frequency < 2000000) -+ else if (has_N60_errata[policy->cpu] && ((stock_freq * i)/8) < 2000000) - p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; - else - p4clockmod_table[i].frequency = (stock_freq * i)/8; -diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c linux-2.6.16-026test009/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c ---- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c 2006-04-19 15:02:11.000000000 +0400 -@@ -75,7 +75,9 @@ static 
int speedstep_smi_ownership (void - __asm__ __volatile__( - "out %%al, (%%dx)\n" - : "=D" (result) -- : "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic) -+ : "a" (command), "b" (function), "c" (0), "d" (smi_port), -+ "D" (0), "S" (magic) -+ : "memory" - ); - - dprintk("result is %x\n", result); -diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/if.c linux-2.6.16-026test009/arch/i386/kernel/cpu/mtrr/if.c ---- linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/if.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/cpu/mtrr/if.c 2006-04-19 15:02:12.000000000 +0400 -@@ -392,7 +392,7 @@ static int __init mtrr_if_init(void) - return -ENODEV; - - proc_root_mtrr = -- create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root); -+ create_proc_entry("mtrr", S_IWUSR | S_IRUGO, NULL); - if (proc_root_mtrr) { - proc_root_mtrr->owner = THIS_MODULE; - proc_root_mtrr->proc_fops = &mtrr_fops; -diff -upr linux-2.6.16.orig/arch/i386/kernel/dmi_scan.c linux-2.6.16-026test009/arch/i386/kernel/dmi_scan.c ---- linux-2.6.16.orig/arch/i386/kernel/dmi_scan.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/dmi_scan.c 2006-04-19 15:02:11.000000000 +0400 -@@ -106,7 +106,7 @@ static void __init dmi_save_devices(stru - struct dmi_device *dev; - - for (i = 0; i < count; i++) { -- char *d = ((char *) dm) + (i * 2); -+ char *d = (char *)(dm + 1) + (i * 2); - - /* Skip disabled device */ - if ((*d & 0x80) == 0) -diff -upr linux-2.6.16.orig/arch/i386/kernel/irq.c linux-2.6.16-026test009/arch/i386/kernel/irq.c ---- linux-2.6.16.orig/arch/i386/kernel/irq.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/irq.c 2006-04-19 15:02:12.000000000 +0400 -@@ -59,7 +59,9 @@ fastcall unsigned int do_IRQ(struct pt_r - union irq_ctx *curctx, *irqctx; - u32 *isp; - #endif -+ struct ve_struct *ve; - -+ ve = set_exec_env(get_ve0()); - irq_enter(); - #ifdef CONFIG_DEBUG_STACKOVERFLOW - /* Debugging 
check for stack overflow: is there less than 1KB free? */ -@@ -108,6 +110,7 @@ fastcall unsigned int do_IRQ(struct pt_r - __do_IRQ(irq, regs); - - irq_exit(); -+ (void)set_exec_env(ve); - - return 1; - } -diff -upr linux-2.6.16.orig/arch/i386/kernel/ldt.c linux-2.6.16-026test009/arch/i386/kernel/ldt.c ---- linux-2.6.16.orig/arch/i386/kernel/ldt.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/ldt.c 2006-04-19 15:02:12.000000000 +0400 -@@ -13,6 +13,7 @@ - #include <linux/smp_lock.h> - #include <linux/vmalloc.h> - #include <linux/slab.h> -+#include <linux/module.h> - - #include <asm/uaccess.h> - #include <asm/system.h> -@@ -20,6 +21,8 @@ - #include <asm/desc.h> - #include <asm/mmu_context.h> - -+#include <ub/ub_mem.h> -+ - #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ - static void flush_ldt(void *null) - { -@@ -39,9 +42,9 @@ static int alloc_ldt(mm_context_t *pc, i - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) -- newldt = vmalloc(mincount*LDT_ENTRY_SIZE); -+ newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); - else -- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); -+ newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; -@@ -105,6 +108,7 @@ int init_new_context(struct task_struct - } - return retval; - } -+EXPORT_SYMBOL_GPL(init_new_context); - - /* - * No need to lock the MM as we are the last user -@@ -251,3 +255,5 @@ asmlinkage int sys_modify_ldt(int func, - } - return ret; - } -+ -+EXPORT_SYMBOL_GPL(default_ldt); -diff -upr linux-2.6.16.orig/arch/i386/kernel/nmi.c linux-2.6.16-026test009/arch/i386/kernel/nmi.c ---- linux-2.6.16.orig/arch/i386/kernel/nmi.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/nmi.c 2006-04-19 15:02:11.000000000 +0400 -@@ -521,7 +521,22 @@ void touch_nmi_watchdog (void) - - extern void die_nmi(struct pt_regs *, const char *msg); - --void nmi_watchdog_tick 
(struct pt_regs * regs) -+void smp_show_regs(struct pt_regs *regs, void *info) -+{ -+ static DEFINE_SPINLOCK(show_regs_lock); -+ -+ if (regs == NULL) -+ return; -+ -+ bust_spinlocks(1); -+ spin_lock(&show_regs_lock); -+ printk("----------- IPI show regs -----------"); -+ show_regs(regs); -+ spin_unlock(&show_regs_lock); -+ bust_spinlocks(0); -+} -+ -+void nmi_watchdog_tick(struct pt_regs *regs) - { - - /* -diff -upr linux-2.6.16.orig/arch/i386/kernel/process.c linux-2.6.16-026test009/arch/i386/kernel/process.c ---- linux-2.6.16.orig/arch/i386/kernel/process.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/process.c 2006-04-19 15:02:12.000000000 +0400 -@@ -59,6 +59,7 @@ - #include <asm/cpu.h> - - asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -+EXPORT_SYMBOL_GPL(ret_from_fork); - - static int hlt_counter; - -@@ -289,11 +290,14 @@ __setup("idle=", idle_setup); - void show_regs(struct pt_regs * regs) - { - unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; -+ extern int die_counter; - - printk("\n"); -- printk("Pid: %d, comm: %20s\n", current->pid, current->comm); -+ printk("Pid: %d, comm: %20s, oopses: %d\n", -+ current->pid, current->comm, die_counter); - printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); -- print_symbol("EIP is at %s\n", regs->eip); -+ if (decode_call_traces) -+ print_symbol("EIP is at %s\n", regs->eip); - - if (user_mode(regs)) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); -@@ -314,6 +318,8 @@ void show_regs(struct pt_regs * regs) - cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); - show_trace(NULL, ®s->esp); -+ if (!decode_call_traces) -+ printk(" EIP: [<%08lx>]\n",regs->eip); - } - - /* -@@ -339,6 +345,13 @@ int kernel_thread(int (*fn)(void *), voi - { - struct pt_regs regs; - -+ /* Don't allow kernel_thread() inside VE */ -+ if (!ve_is_super(get_exec_env())) { -+ printk("kernel_thread call 
inside VE\n"); -+ dump_stack(); -+ return -EPERM; -+ } -+ - memset(®s, 0, sizeof(regs)); - - regs.ebx = (unsigned long) fn; -diff -upr linux-2.6.16.orig/arch/i386/kernel/ptrace.c linux-2.6.16-026test009/arch/i386/kernel/ptrace.c ---- linux-2.6.16.orig/arch/i386/kernel/ptrace.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/ptrace.c 2006-04-19 15:02:12.000000000 +0400 -@@ -706,7 +706,9 @@ int do_syscall_trace(struct pt_regs *reg - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ -+ set_pn_state(current, entryexit ? PN_STOP_LEAVE : PN_STOP_ENTRY); - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); -+ clear_pn_state(current); - - /* - * this isn't the same as continuing with a signal, but it will do -diff -upr linux-2.6.16.orig/arch/i386/kernel/signal.c linux-2.6.16-026test009/arch/i386/kernel/signal.c ---- linux-2.6.16.orig/arch/i386/kernel/signal.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/signal.c 2006-04-19 15:02:11.000000000 +0400 -@@ -582,7 +582,7 @@ static void fastcall do_signal(struct pt - if (!user_mode(regs)) - return; - -- if (try_to_freeze()) -+ if (try_to_freeze() && !signal_pending(current)) - goto no_signal; - - if (test_thread_flag(TIF_RESTORE_SIGMASK)) -diff -upr linux-2.6.16.orig/arch/i386/kernel/smp.c linux-2.6.16-026test009/arch/i386/kernel/smp.c ---- linux-2.6.16.orig/arch/i386/kernel/smp.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/smp.c 2006-04-19 15:02:11.000000000 +0400 -@@ -21,6 +21,7 @@ - #include <linux/cpu.h> - #include <linux/module.h> - -+#include <asm/nmi.h> - #include <asm/mtrr.h> - #include <asm/tlbflush.h> - #include <mach_apic.h> -@@ -566,6 +567,89 @@ int smp_call_function (void (*func) (voi - } - EXPORT_SYMBOL(smp_call_function); - -+static spinlock_t 
nmi_call_lock = SPIN_LOCK_UNLOCKED; -+static struct nmi_call_data_struct { -+ smp_nmi_function func; -+ void *info; -+ atomic_t started; -+ atomic_t finished; -+ cpumask_t cpus_called; -+ int wait; -+} *nmi_call_data; -+ -+static int smp_nmi_callback(struct pt_regs * regs, int cpu) -+{ -+ smp_nmi_function func; -+ void *info; -+ int wait; -+ -+ func = nmi_call_data->func; -+ info = nmi_call_data->info; -+ wait = nmi_call_data->wait; -+ ack_APIC_irq(); -+ /* prevent from calling func() multiple times */ -+ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) -+ return 0; -+ /* -+ * notify initiating CPU that I've grabbed the data and am -+ * about to execute the function -+ */ -+ mb(); -+ atomic_inc(&nmi_call_data->started); -+ /* at this point the nmi_call_data structure is out of scope */ -+ irq_enter(); -+ func(regs, info); -+ irq_exit(); -+ if (wait) -+ atomic_inc(&nmi_call_data->finished); -+ -+ return 0; -+} -+ -+/* -+ * This function tries to call func(regs, info) on each cpu. -+ * Func must be fast and non-blocking. -+ * May be called with disabled interrupts and from any context. 
-+ */ -+int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) -+{ -+ struct nmi_call_data_struct data; -+ int cpus; -+ -+ cpus = num_online_cpus() - 1; -+ if (!cpus) -+ return 0; -+ -+ data.func = func; -+ data.info = info; -+ data.wait = wait; -+ atomic_set(&data.started, 0); -+ atomic_set(&data.finished, 0); -+ cpus_clear(data.cpus_called); -+ /* prevent this cpu from calling func if NMI happens */ -+ cpu_set(smp_processor_id(), data.cpus_called); -+ -+ if (!spin_trylock(&nmi_call_lock)) -+ return -1; -+ -+ nmi_call_data = &data; -+ set_nmi_ipi_callback(smp_nmi_callback); -+ mb(); -+ -+ /* Send a message to all other CPUs and wait for them to respond */ -+ send_IPI_allbutself(APIC_DM_NMI); -+ while (atomic_read(&data.started) != cpus) -+ barrier(); -+ -+ unset_nmi_ipi_callback(); -+ if (wait) -+ while (atomic_read(&data.finished) != cpus) -+ barrier(); -+ spin_unlock(&nmi_call_lock); -+ -+ return 0; -+} -+ - static void stop_this_cpu (void * dummy) - { - /* -diff -upr linux-2.6.16.orig/arch/i386/kernel/smpboot.c linux-2.6.16-026test009/arch/i386/kernel/smpboot.c ---- linux-2.6.16.orig/arch/i386/kernel/smpboot.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/smpboot.c 2006-04-19 15:02:12.000000000 +0400 -@@ -317,6 +317,10 @@ static void __init synchronize_tsc_bp (v - } - if (!buggy) - printk("passed.\n"); -+#ifdef CONFIG_VE -+ /* TSC reset. kill whatever might rely on old values */ -+ VE_TASK_INFO(current)->wakeup_stamp = 0; -+#endif - } - - static void __init synchronize_tsc_ap (void) -@@ -342,6 +346,10 @@ static void __init synchronize_tsc_ap (v - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); - } -+#ifdef CONFIG_VE -+ /* TSC reset. 
kill whatever might rely on old values */ -+ VE_TASK_INFO(current)->wakeup_stamp = 0; -+#endif - } - #undef NR_LOOPS - -@@ -908,6 +916,13 @@ static int __devinit do_boot_cpu(int api - if (IS_ERR(idle)) - panic("failed fork for CPU %d", cpu); - idle->thread.eip = (unsigned long) start_secondary; -+ -+#ifdef CONFIG_VE -+ /* Cosmetic: sleep_time won't be changed afterwards for the idle -+ * thread; keep it 0 rather than -cycles. */ -+ VE_TASK_INFO(idle)->sleep_time = 0; -+#endif -+ - /* start_eip had better be page-aligned! */ - start_eip = setup_trampoline(); - -diff -upr linux-2.6.16.orig/arch/i386/kernel/sys_i386.c linux-2.6.16-026test009/arch/i386/kernel/sys_i386.c ---- linux-2.6.16.orig/arch/i386/kernel/sys_i386.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/sys_i386.c 2006-04-19 15:02:12.000000000 +0400 -@@ -217,7 +217,7 @@ asmlinkage int sys_uname(struct old_utsn - if (!name) - return -EFAULT; - down_read(&uts_sem); -- err=copy_to_user(name, &system_utsname, sizeof (*name)); -+ err=copy_to_user(name, &ve_utsname, sizeof (*name)); - up_read(&uts_sem); - return err?-EFAULT:0; - } -@@ -233,15 +233,15 @@ asmlinkage int sys_olduname(struct oldol - - down_read(&uts_sem); - -- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); -+ error = __copy_to_user(name->sysname,ve_utsname.sysname,__OLD_UTS_LEN); - error |= __put_user(0,name->sysname+__OLD_UTS_LEN); -- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); -+ error |= __copy_to_user(name->nodename,ve_utsname.nodename,__OLD_UTS_LEN); - error |= __put_user(0,name->nodename+__OLD_UTS_LEN); -- error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); -+ error |= __copy_to_user(name->release,ve_utsname.release,__OLD_UTS_LEN); - error |= __put_user(0,name->release+__OLD_UTS_LEN); -- error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); -+ error |= 
__copy_to_user(name->version,ve_utsname.version,__OLD_UTS_LEN); - error |= __put_user(0,name->version+__OLD_UTS_LEN); -- error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); -+ error |= __copy_to_user(name->machine,ve_utsname.machine,__OLD_UTS_LEN); - error |= __put_user(0,name->machine+__OLD_UTS_LEN); - - up_read(&uts_sem); -diff -upr linux-2.6.16.orig/arch/i386/kernel/syscall_table.S linux-2.6.16-026test009/arch/i386/kernel/syscall_table.S ---- linux-2.6.16.orig/arch/i386/kernel/syscall_table.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/syscall_table.S 2006-04-19 15:02:11.000000000 +0400 -@@ -310,3 +310,12 @@ ENTRY(sys_call_table) - .long sys_pselect6 - .long sys_ppoll - .long sys_unshare /* 310 */ -+ -+ .rept 510-(.-sys_call_table)/4 -+ .long sys_ni_syscall -+ .endr -+ -+ .long sys_getluid /* 510 */ -+ .long sys_setluid -+ .long sys_setublimit -+ .long sys_ubstat -diff -upr linux-2.6.16.orig/arch/i386/kernel/timers/timer_tsc.c linux-2.6.16-026test009/arch/i386/kernel/timers/timer_tsc.c ---- linux-2.6.16.orig/arch/i386/kernel/timers/timer_tsc.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/timers/timer_tsc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -94,7 +94,7 @@ static int count2; /* counter for mark_o - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. 
- */ --static unsigned long fast_gettimeoffset_quotient; -+unsigned long fast_gettimeoffset_quotient; - - static unsigned long get_offset_tsc(void) - { -diff -upr linux-2.6.16.orig/arch/i386/kernel/traps.c linux-2.6.16-026test009/arch/i386/kernel/traps.c ---- linux-2.6.16.orig/arch/i386/kernel/traps.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/kernel/traps.c 2006-04-19 15:02:12.000000000 +0400 -@@ -116,8 +116,10 @@ static void print_addr_and_symbol(unsign - { - printk(log_lvl); - printk(" [<%08lx>] ", addr); -- print_symbol("%s", addr); -- printk("\n"); -+ if (decode_call_traces) { -+ print_symbol("%s", addr); -+ printk("\n"); -+ } - } - - static inline unsigned long print_context_stack(struct thread_info *tinfo, -@@ -167,7 +169,10 @@ static void show_trace_log_lvl(struct ta - if (!stack) - break; - printk(log_lvl); -- printk(" =======================\n"); -+ if (decode_call_traces) -+ printk(" =======================\n"); -+ else -+ printk(" =<ctx>= "); - } - } - -@@ -203,8 +208,13 @@ static void show_stack_log_lvl(struct ta - } - printk("\n"); - printk(log_lvl); -- printk("Call Trace:\n"); -+ if (decode_call_traces) -+ printk("Call Trace:\n"); -+ else -+ printk("Call Trace: "); - show_trace_log_lvl(task, esp, log_lvl); -+ if (!decode_call_traces) -+ printk("\n"); - } - - void show_stack(struct task_struct *task, unsigned long *esp) -@@ -220,6 +230,8 @@ void dump_stack(void) - unsigned long stack; - - show_trace(current, &stack); -+ if (!decode_call_traces) -+ printk("\n"); - } - - EXPORT_SYMBOL(dump_stack); -@@ -252,8 +264,11 @@ void show_registers(struct pt_regs *regs - regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); -- printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)", -- current->comm, current->pid, current_thread_info(), current); -+ printk(KERN_EMERG "Process %s (pid: %d, veid=%d, threadinfo=%p task=%p)", -+ current->comm, 
current->pid, -+ VEID(VE_TASK_INFO(current)->owner_env), -+ current_thread_info(), current); -+ - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. -@@ -299,9 +314,9 @@ static void handle_BUG(struct pt_regs *r - goto no_bug; - if (ud2 != 0x0b0f) - goto no_bug; -- if (__get_user(line, (unsigned short __user *)(eip + 2))) -+ if (__get_user(line, (unsigned short __user *)(eip + 4))) - goto bug; -- if (__get_user(file, (char * __user *)(eip + 4)) || -+ if (__get_user(file, (char * __user *)(eip + 7)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) - file = "<bad filename>"; - -@@ -316,6 +331,15 @@ bug: - printk(KERN_EMERG "Kernel BUG\n"); - } - -+int die_counter = 0; -+ -+static void inline check_kernel_csum_bug(void) -+{ -+ if (kernel_text_csum_broken) -+ printk("Kernel code checksum mismatch detected %d times\n", -+ kernel_text_csum_broken); -+} -+ - /* This is gone through when something in the kernel - * has done something bad and is about to be terminated. 
- */ -@@ -330,7 +354,6 @@ void die(const char * str, struct pt_reg - .lock_owner = -1, - .lock_owner_depth = 0 - }; -- static int die_counter; - unsigned long flags; - - if (die.lock_owner != raw_smp_processor_id()) { -@@ -370,6 +393,7 @@ void die(const char * str, struct pt_reg - } else - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - -+ check_kernel_csum_bug(); - bust_spinlocks(0); - die.lock_owner = -1; - spin_unlock_irqrestore(&die.lock, flags); -@@ -597,12 +621,27 @@ static void unknown_nmi_error(unsigned c - printk("Do you have a strange power saving mode enabled?\n"); - } - --static DEFINE_SPINLOCK(nmi_print_lock); -+/* -+ * Voyager doesn't implement these -+ */ -+void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info) -+{ -+} -+ -+#ifdef CONFIG_SMP -+int __attribute__((weak)) -+smp_nmi_call_function(smp_nmi_function func, void *info, int wait) -+{ -+ return 0; -+} -+#endif - - void die_nmi (struct pt_regs *regs, const char *msg) - { -+ static DEFINE_SPINLOCK(nmi_print_lock); -+ - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) == -- NOTIFY_STOP) -+ NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); -@@ -615,6 +654,11 @@ void die_nmi (struct pt_regs *regs, cons - printk(" on CPU%d, eip %08lx, registers:\n", - smp_processor_id(), regs->eip); - show_registers(regs); -+ smp_nmi_call_function(smp_show_regs, NULL, 1); -+ bust_spinlocks(1); -+ /* current CPU messages should go bottom */ -+ if (!decode_call_traces) -+ smp_show_regs(regs, NULL); - printk(KERN_EMERG "console shuts up ...\n"); - console_silent(); - spin_unlock(&nmi_print_lock); -@@ -631,6 +675,14 @@ void die_nmi (struct pt_regs *regs, cons - do_exit(SIGSEGV); - } - -+static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -+{ -+ return 0; -+} -+ -+static nmi_callback_t nmi_callback = dummy_nmi_callback; -+static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; -+ - static void default_do_nmi(struct pt_regs * regs) - { - unsigned char 
reason = 0; -@@ -653,6 +705,9 @@ static void default_do_nmi(struct pt_reg - return; - } - #endif -+ if (nmi_ipi_callback != dummy_nmi_callback) -+ return; -+ - unknown_nmi_error(reason, regs); - return; - } -@@ -669,13 +724,6 @@ static void default_do_nmi(struct pt_reg - reassert_nmi(); - } - --static int dummy_nmi_callback(struct pt_regs * regs, int cpu) --{ -- return 0; --} -- --static nmi_callback_t nmi_callback = dummy_nmi_callback; -- - fastcall void do_nmi(struct pt_regs * regs, long error_code) - { - int cpu; -@@ -689,9 +737,20 @@ fastcall void do_nmi(struct pt_regs * re - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); - -+ nmi_ipi_callback(regs, cpu); - nmi_exit(); - } - -+void set_nmi_ipi_callback(nmi_callback_t callback) -+{ -+ nmi_ipi_callback = callback; -+} -+ -+void unset_nmi_ipi_callback(void) -+{ -+ nmi_ipi_callback = dummy_nmi_callback; -+} -+ - void set_nmi_callback(nmi_callback_t callback) - { - rcu_assign_pointer(nmi_callback, callback); -diff -upr linux-2.6.16.orig/arch/i386/mm/fault.c linux-2.6.16-026test009/arch/i386/mm/fault.c ---- linux-2.6.16.orig/arch/i386/mm/fault.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/mm/fault.c 2006-04-19 15:02:12.000000000 +0400 -@@ -31,32 +31,6 @@ - extern void die(const char *,struct pt_regs *,long); - - /* -- * Unlock any spinlocks which will prevent us from getting the -- * message out -- */ --void bust_spinlocks(int yes) --{ -- int loglevel_save = console_loglevel; -- -- if (yes) { -- oops_in_progress = 1; -- return; -- } --#ifdef CONFIG_VT -- unblank_screen(); --#endif -- oops_in_progress = 0; -- /* -- * OK, the message is on the console. Now we call printk() -- * without oops_in_progress set so that printk will give klogd -- * a poke. Hold onto your hats... -- */ -- console_loglevel = 15; /* NMI oopser may have shut the console up */ -- printk(" "); -- console_loglevel = loglevel_save; --} -- --/* - * Return EIP plus the CS segment base. 
The segment limit is also - * adjusted, clamped to the kernel/user address space (whichever is - * appropriate), and returned in *eip_limit. -@@ -347,7 +321,6 @@ good_area: - goto bad_area; - } - -- survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo -@@ -485,14 +458,14 @@ no_context: - */ - out_of_memory: - up_read(&mm->mmap_sem); -- if (tsk->pid == 1) { -- yield(); -- down_read(&mm->mmap_sem); -- goto survive; -+ if (error_code & 4) { -+ /* -+ * 0-order allocation always success if something really -+ * fatal not happen: beancounter overdraft or OOM. -+ */ -+ force_sig(SIGKILL, tsk); -+ return; - } -- printk("VM: killing process %s\n", tsk->comm); -- if (error_code & 4) -- do_exit(SIGKILL); - goto no_context; - - do_sigbus: -diff -upr linux-2.6.16.orig/arch/i386/mm/init.c linux-2.6.16-026test009/arch/i386/mm/init.c ---- linux-2.6.16.orig/arch/i386/mm/init.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/mm/init.c 2006-04-19 15:02:11.000000000 +0400 -@@ -677,7 +677,7 @@ void __init pgtable_cache_init(void) - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), -- 0, -+ SLAB_UBC, - pmd_ctor, - NULL); - if (!pmd_cache) -@@ -686,7 +686,7 @@ void __init pgtable_cache_init(void) - pgd_cache = kmem_cache_create("pgd", - PTRS_PER_PGD*sizeof(pgd_t), - PTRS_PER_PGD*sizeof(pgd_t), -- 0, -+ SLAB_UBC, - pgd_ctor, - PTRS_PER_PMD == 1 ? 
pgd_dtor : NULL); - if (!pgd_cache) -diff -upr linux-2.6.16.orig/arch/i386/mm/pgtable.c linux-2.6.16-026test009/arch/i386/mm/pgtable.c ---- linux-2.6.16.orig/arch/i386/mm/pgtable.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/i386/mm/pgtable.c 2006-04-19 15:02:12.000000000 +0400 -@@ -5,8 +5,10 @@ - #include <linux/config.h> - #include <linux/sched.h> - #include <linux/kernel.h> -+#include <linux/module.h> - #include <linux/errno.h> - #include <linux/mm.h> -+#include <linux/vmalloc.h> - #include <linux/swap.h> - #include <linux/smp.h> - #include <linux/highmem.h> -@@ -64,7 +66,9 @@ void show_mem(void) - printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); - printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); - printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); -+ vprintstat(); - } -+EXPORT_SYMBOL(show_mem); - - /* - * Associate a virtual page frame with a given physical page frame -@@ -159,9 +163,11 @@ struct page *pte_alloc_one(struct mm_str - struct page *pte; - - #ifdef CONFIG_HIGHPTE -- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -+ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_HIGHMEM| -+ __GFP_REPEAT|__GFP_ZERO, 0); - #else -- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -+ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC| -+ __GFP_REPEAT|__GFP_ZERO, 0); - #endif - return pte; - } -diff -upr linux-2.6.16.orig/arch/ia64/Kconfig linux-2.6.16-026test009/arch/ia64/Kconfig ---- linux-2.6.16.orig/arch/ia64/Kconfig 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/Kconfig 2006-04-19 15:02:12.000000000 +0400 -@@ -464,6 +464,10 @@ endmenu - - source "arch/ia64/Kconfig.debug" - -+source "kernel/Kconfig.openvz" -+ - source "security/Kconfig" - - source "crypto/Kconfig" -+ -+source "kernel/ub/Kconfig" -diff -upr linux-2.6.16.orig/arch/ia64/ia32/binfmt_elf32.c linux-2.6.16-026test009/arch/ia64/ia32/binfmt_elf32.c ---- 
linux-2.6.16.orig/arch/ia64/ia32/binfmt_elf32.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/ia32/binfmt_elf32.c 2006-04-19 15:02:11.000000000 +0400 -@@ -136,6 +136,12 @@ ia64_elf32_init (struct pt_regs *regs) - up_write(¤t->mm->mmap_sem); - } - -+ if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * -+ IA32_LDT_ENTRY_SIZE), -+ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, -+ NULL, UB_SOFT)) -+ goto skip; -+ - /* - * Install LDT as anonymous memory. This gives us all-zero segment descriptors - * until a task modifies them via modify_ldt(). -@@ -157,7 +163,12 @@ ia64_elf32_init (struct pt_regs *regs) - } - } - up_write(¤t->mm->mmap_sem); -- } -+ } else -+ ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * -+ IA32_LDT_ENTRY_SIZE), -+ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL); -+ -+skip: - - ia64_psr(regs)->ac = 0; /* turn off alignment checking */ - regs->loadrs = 0; -@@ -212,9 +223,15 @@ ia32_setup_arg_pages (struct linux_binpr - bprm->loader += stack_base; - bprm->exec += stack_base; - -+ ret = -ENOMEM; -+ if (ub_memory_charge(mm, IA32_STACK_TOP - -+ (PAGE_MASK & (unsigned long)bprm->p), -+ VM_STACK_FLAGS, NULL, UB_SOFT)) -+ goto err_charge; -+ - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!mpnt) -- return -ENOMEM; -+ goto err_alloc; - - memset(mpnt, 0, sizeof(*mpnt)); - -@@ -231,11 +248,8 @@ ia32_setup_arg_pages (struct linux_binpr - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)? 
- PAGE_COPY_EXEC: PAGE_COPY; -- if ((ret = insert_vm_struct(current->mm, mpnt))) { -- up_write(¤t->mm->mmap_sem); -- kmem_cache_free(vm_area_cachep, mpnt); -- return ret; -- } -+ if ((ret = insert_vm_struct(current->mm, mpnt))) -+ goto err_insert; - current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt); - } - -@@ -254,6 +268,16 @@ ia32_setup_arg_pages (struct linux_binpr - current->thread.ppl = ia32_init_pp_list(); - - return 0; -+ -+err_insert: -+ up_write(¤t->mm->mmap_sem); -+ kmem_cache_free(vm_area_cachep, mpnt); -+err_alloc: -+ ub_memory_uncharge(mm, IA32_STACK_TOP - -+ (PAGE_MASK & (unsigned long)bprm->p), -+ VM_STACK_FLAGS, NULL); -+err_charge: -+ return ret; - } - - static void -diff -upr linux-2.6.16.orig/arch/ia64/kernel/asm-offsets.c linux-2.6.16-026test009/arch/ia64/kernel/asm-offsets.c ---- linux-2.6.16.orig/arch/ia64/kernel/asm-offsets.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/asm-offsets.c 2006-04-19 15:02:12.000000000 +0400 -@@ -44,11 +44,21 @@ void foo(void) - DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); - DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); - DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); -+#ifdef CONFIG_VE -+ DEFINE(IA64_TASK_PID_OFFSET, offsetof -+ (struct task_struct, pids[PIDTYPE_PID].vnr)); -+#else - DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); -+#endif - DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); - DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); - DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); -+#ifdef CONFIG_VE -+ DEFINE(IA64_TASK_TGID_OFFSET, offsetof -+ (struct task_struct, pids[PIDTYPE_TGID].vnr)); -+#else - DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); -+#endif - DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); 
- DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); - -diff -upr linux-2.6.16.orig/arch/ia64/kernel/entry.S linux-2.6.16-026test009/arch/ia64/kernel/entry.S ---- linux-2.6.16.orig/arch/ia64/kernel/entry.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/entry.S 2006-04-19 15:02:11.000000000 +0400 -@@ -1620,4 +1620,12 @@ sys_call_table: - data8 sys_ni_syscall // 1295 reserved for ppoll - data8 sys_unshare - -+.rept 1505-1297 -+ data8 sys_ni_syscall -+.endr -+ data8 sys_getluid // 1505 -+ data8 sys_setluid -+ data8 sys_setublimit -+ data8 sys_ubstat -+ - .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls -diff -upr linux-2.6.16.orig/arch/ia64/kernel/fsys.S linux-2.6.16-026test009/arch/ia64/kernel/fsys.S ---- linux-2.6.16.orig/arch/ia64/kernel/fsys.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/fsys.S 2006-04-19 15:02:12.000000000 +0400 -@@ -72,6 +72,7 @@ ENTRY(fsys_getpid) - FSYS_RETURN - END(fsys_getpid) - -+#ifndef CONFIG_VE - ENTRY(fsys_getppid) - .prologue - .altrp b6 -@@ -118,6 +119,7 @@ ENTRY(fsys_getppid) - #endif - FSYS_RETURN - END(fsys_getppid) -+#endif - - ENTRY(fsys_set_tid_address) - .prologue -@@ -665,7 +667,11 @@ fsyscall_table: - data8 0 // chown - data8 0 // lseek // 1040 - data8 fsys_getpid // getpid -+#ifdef CONFIG_VE -+ data8 0 -+#else - data8 fsys_getppid // getppid -+#endif - data8 0 // mount - data8 0 // umount - data8 0 // setuid // 1045 -diff -upr linux-2.6.16.orig/arch/ia64/kernel/irq.c linux-2.6.16-026test009/arch/ia64/kernel/irq.c ---- linux-2.6.16.orig/arch/ia64/kernel/irq.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/irq.c 2006-04-19 15:02:12.000000000 +0400 -@@ -163,7 +163,9 @@ void fixup_irqs(void) - { - unsigned int irq; - extern void ia64_process_pending_intr(void); -+ struct ve_struct *ve; - -+ ve = set_exec_env(get_ve0()); - ia64_set_itv(1<<16); - /* 
- * Phase 1: Locate irq's bound to this cpu and -@@ -197,5 +199,6 @@ void fixup_irqs(void) - */ - max_xtp(); - local_irq_disable(); -+ (void)set_exec_env(ve); - } - #endif -diff -upr linux-2.6.16.orig/arch/ia64/kernel/irq_ia64.c linux-2.6.16-026test009/arch/ia64/kernel/irq_ia64.c ---- linux-2.6.16.orig/arch/ia64/kernel/irq_ia64.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/irq_ia64.c 2006-04-19 15:02:12.000000000 +0400 -@@ -103,6 +103,7 @@ void - ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) - { - unsigned long saved_tpr; -+ struct ve_struct *ve; - - #if IRQ_DEBUG - { -@@ -139,6 +140,7 @@ ia64_handle_irq (ia64_vector vector, str - * 16 (without this, it would be ~240, which could easily lead - * to kernel stack overflows). - */ -+ ve = set_exec_env(get_ve0()); - irq_enter(); - saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); - ia64_srlz_d(); -@@ -164,6 +166,7 @@ ia64_handle_irq (ia64_vector vector, str - * come through until ia64_eoi() has been done. 
- */ - irq_exit(); -+ (void)set_exec_env(get_ve0()); - } - - #ifdef CONFIG_HOTPLUG_CPU -@@ -176,9 +179,11 @@ void ia64_process_pending_intr(void) - ia64_vector vector; - unsigned long saved_tpr; - extern unsigned int vectors_in_migration[NR_IRQS]; -+ struct ve_struct *ve; - - vector = ia64_get_ivr(); - -+ ve = set_exec_env(get_ve0()); - irq_enter(); - saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); - ia64_srlz_d(); -@@ -210,6 +215,7 @@ void ia64_process_pending_intr(void) - vector = ia64_get_ivr(); - } - irq_exit(); -+ (void)set_exec_env(ve); - } - #endif - -diff -upr linux-2.6.16.orig/arch/ia64/kernel/mca.c linux-2.6.16-026test009/arch/ia64/kernel/mca.c ---- linux-2.6.16.orig/arch/ia64/kernel/mca.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/mca.c 2006-04-19 15:02:12.000000000 +0400 -@@ -1241,10 +1241,10 @@ default_monarch_init_process(struct noti - } - printk("\n\n"); - if (read_trylock(&tasklist_lock)) { -- do_each_thread (g, t) { -+ do_each_thread_all (g, t) { - printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); - show_stack(t, NULL); -- } while_each_thread (g, t); -+ } while_each_thread_all (g, t); - read_unlock(&tasklist_lock); - } - return NOTIFY_DONE; -diff -upr linux-2.6.16.orig/arch/ia64/kernel/perfmon.c linux-2.6.16-026test009/arch/ia64/kernel/perfmon.c ---- linux-2.6.16.orig/arch/ia64/kernel/perfmon.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/perfmon.c 2006-04-19 15:02:12.000000000 +0400 -@@ -2624,7 +2624,7 @@ pfm_get_task(pfm_context_t *ctx, pid_t p - - read_lock(&tasklist_lock); - -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - - /* make sure task cannot go away while we operate on it */ - if (p) get_task_struct(p); -@@ -4188,12 +4188,12 @@ pfm_check_task_exist(pfm_context_t *ctx) - - read_lock(&tasklist_lock); - -- do_each_thread (g, t) { -+ do_each_thread_ve (g, t) { - if (t->thread.pfm_context == ctx) { - ret = 0; - break; - } -- } while_each_thread (g, 
t); -+ } while_each_thread_ve (g, t); - - read_unlock(&tasklist_lock); - -diff -upr linux-2.6.16.orig/arch/ia64/kernel/process.c linux-2.6.16-026test009/arch/ia64/kernel/process.c ---- linux-2.6.16.orig/arch/ia64/kernel/process.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/process.c 2006-04-19 15:02:12.000000000 +0400 -@@ -681,6 +681,13 @@ kernel_thread (int (*fn)(void *), void * - struct pt_regs pt; - } regs; - -+ /* Don't allow kernel_thread() inside VE */ -+ if (!ve_is_super(get_exec_env())) { -+ printk("kernel_thread call inside VE\n"); -+ dump_stack(); -+ return -EPERM; -+ } -+ - memset(®s, 0, sizeof(regs)); - regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ - regs.pt.r1 = helper_fptr[1]; /* set GP */ -diff -upr linux-2.6.16.orig/arch/ia64/kernel/ptrace.c linux-2.6.16-026test009/arch/ia64/kernel/ptrace.c ---- linux-2.6.16.orig/arch/ia64/kernel/ptrace.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/ptrace.c 2006-04-19 15:02:12.000000000 +0400 -@@ -1433,7 +1433,7 @@ sys_ptrace (long request, pid_t pid, uns - ret = -ESRCH; - read_lock(&tasklist_lock); - { -- child = find_task_by_pid(pid); -+ child = find_task_by_pid_ve(pid); - if (child) { - if (peek_or_poke) - child = find_thread_for_addr(child, addr); -diff -upr linux-2.6.16.orig/arch/ia64/kernel/signal.c linux-2.6.16-026test009/arch/ia64/kernel/signal.c ---- linux-2.6.16.orig/arch/ia64/kernel/signal.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/signal.c 2006-04-19 15:02:12.000000000 +0400 -@@ -270,7 +270,7 @@ ia64_rt_sigreturn (struct sigscratch *sc - si.si_signo = SIGSEGV; - si.si_errno = 0; - si.si_code = SI_KERNEL; -- si.si_pid = current->pid; -+ si.si_pid = virt_pid(current); - si.si_uid = current->uid; - si.si_addr = sc; - force_sig_info(SIGSEGV, &si, current); -@@ -375,7 +375,7 @@ force_sigsegv_info (int sig, void __user - si.si_signo = SIGSEGV; - si.si_errno = 0; - si.si_code = 
SI_KERNEL; -- si.si_pid = current->pid; -+ si.si_pid = virt_pid(current); - si.si_uid = current->uid; - si.si_addr = addr; - force_sig_info(SIGSEGV, &si, current); -@@ -641,7 +641,7 @@ set_sigdelayed(pid_t pid, int signo, int - for (i = 1; i <= 3; ++i) { - switch (i) { - case 1: -- t = find_task_by_pid(pid); -+ t = find_task_by_pid_ve(pid); - if (t) - start_time = start_time_ul(t); - break; -@@ -682,7 +682,7 @@ do_sigdelayed(void) - siginfo.si_code = current_thread_info()->sigdelayed.code; - siginfo.si_addr = current_thread_info()->sigdelayed.addr; - pid = current_thread_info()->sigdelayed.pid; -- t = find_task_by_pid(pid); -+ t = find_task_by_pid_ve(pid); - if (!t) - return; - if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) -diff -upr linux-2.6.16.orig/arch/ia64/kernel/traps.c linux-2.6.16-026test009/arch/ia64/kernel/traps.c ---- linux-2.6.16.orig/arch/ia64/kernel/traps.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/traps.c 2006-04-19 15:02:11.000000000 +0400 -@@ -54,34 +54,6 @@ trap_init (void) - fpswa_interface = __va(ia64_boot_param->fpswa); - } - --/* -- * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock -- * is acquired through the console unblank code) -- */ --void --bust_spinlocks (int yes) --{ -- int loglevel_save = console_loglevel; -- -- if (yes) { -- oops_in_progress = 1; -- return; -- } -- --#ifdef CONFIG_VT -- unblank_screen(); --#endif -- oops_in_progress = 0; -- /* -- * OK, the message is on the console. Now we call printk() without -- * oops_in_progress set so that printk will give klogd a poke. Hold onto -- * your hats... 
-- */ -- console_loglevel = 15; /* NMI oopser may have shut the console up */ -- printk(" "); -- console_loglevel = loglevel_save; --} -- - void - die (const char *str, struct pt_regs *regs, long err) - { -diff -upr linux-2.6.16.orig/arch/ia64/kernel/unaligned.c linux-2.6.16-026test009/arch/ia64/kernel/unaligned.c ---- linux-2.6.16.orig/arch/ia64/kernel/unaligned.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/kernel/unaligned.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1290,7 +1290,7 @@ within_logging_rate_limit (void) - { - static unsigned long count, last_time; - -- if (jiffies - last_time > 5*HZ) -+ if (jiffies - last_time > 60 * HZ) - count = 0; - if (count < 5) { - last_time = jiffies; -diff -upr linux-2.6.16.orig/arch/ia64/mm/contig.c linux-2.6.16-026test009/arch/ia64/mm/contig.c ---- linux-2.6.16.orig/arch/ia64/mm/contig.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/mm/contig.c 2006-04-19 15:02:12.000000000 +0400 -@@ -64,6 +64,7 @@ show_mem (void) - printk("%ld pages in page table cache\n", - pgtable_quicklist_total_size()); - } -+EXPORT_SYMBOL(show_mem); - - /* physical address where the bootmem map is located */ - unsigned long bootmap_start; -diff -upr linux-2.6.16.orig/arch/ia64/mm/discontig.c linux-2.6.16-026test009/arch/ia64/mm/discontig.c ---- linux-2.6.16.orig/arch/ia64/mm/discontig.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/mm/discontig.c 2006-04-19 15:02:12.000000000 +0400 -@@ -594,6 +594,7 @@ void show_mem(void) - pgtable_quicklist_total_size()); - printk("%d free buffer pages\n", nr_free_buffer_pages()); - } -+EXPORT_SYMBOL(show_mem); - - /** - * call_pernode_memory - use SRAT to call callback functions with node info -diff -upr linux-2.6.16.orig/arch/ia64/mm/fault.c linux-2.6.16-026test009/arch/ia64/mm/fault.c ---- linux-2.6.16.orig/arch/ia64/mm/fault.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/mm/fault.c 2006-04-19 
15:02:12.000000000 +0400 -@@ -116,7 +116,6 @@ ia64_do_page_fault (unsigned long addres - if ((vma->vm_flags & mask) != mask) - goto bad_area; - -- survive: - /* - * If for any reason at all we couldn't handle the fault, make - * sure we exit gracefully rather than endlessly redo the -@@ -241,13 +240,13 @@ ia64_do_page_fault (unsigned long addres - - out_of_memory: - up_read(&mm->mmap_sem); -- if (current->pid == 1) { -- yield(); -- down_read(&mm->mmap_sem); -- goto survive; -- } -- printk(KERN_CRIT "VM: killing process %s\n", current->comm); -- if (user_mode(regs)) -- do_exit(SIGKILL); -+ if (user_mode(regs)) { -+ /* -+ * 0-order allocation always success if something really -+ * fatal not happen: beancounter overdraft or OOM. -+ */ -+ force_sig(SIGKILL, current); -+ return; -+ } - goto no_context; - } -diff -upr linux-2.6.16.orig/arch/ia64/mm/init.c linux-2.6.16-026test009/arch/ia64/mm/init.c ---- linux-2.6.16.orig/arch/ia64/mm/init.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ia64/mm/init.c 2006-04-19 15:02:11.000000000 +0400 -@@ -37,6 +37,8 @@ - #include <asm/unistd.h> - #include <asm/mca.h> - -+#include <ub/ub_vmpages.h> -+ - DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - - DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist); -@@ -96,7 +98,7 @@ check_pgt_cache(void) - preempt_disable(); - while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { - while (pages_to_free--) { -- free_page((unsigned long)pgtable_quicklist_alloc()); -+ free_page((unsigned long)pgtable_quicklist_alloc(0)); - } - preempt_enable(); - preempt_disable(); -@@ -146,6 +148,10 @@ ia64_init_addr_space (void) - - ia64_set_rbs_bot(); - -+ if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS, -+ NULL, UB_SOFT)) -+ goto skip; -+ - /* - * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore - * the problem. 
When the process attempts to write to the register backing store -@@ -166,8 +172,11 @@ ia64_init_addr_space (void) - return; - } - up_write(¤t->mm->mmap_sem); -- } -+ } else -+ ub_memory_uncharge(current->mm, PAGE_SIZE, -+ VM_DATA_DEFAULT_FLAGS, NULL); - -+skip: - /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ - if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); -diff -upr linux-2.6.16.orig/arch/m32r/kernel/m32r_ksyms.c linux-2.6.16-026test009/arch/m32r/kernel/m32r_ksyms.c ---- linux-2.6.16.orig/arch/m32r/kernel/m32r_ksyms.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/m32r/kernel/m32r_ksyms.c 2006-04-19 15:02:11.000000000 +0400 -@@ -38,10 +38,6 @@ EXPORT_SYMBOL(__udelay); - EXPORT_SYMBOL(__delay); - EXPORT_SYMBOL(__const_udelay); - --EXPORT_SYMBOL(__get_user_1); --EXPORT_SYMBOL(__get_user_2); --EXPORT_SYMBOL(__get_user_4); -- - EXPORT_SYMBOL(strpbrk); - EXPORT_SYMBOL(strstr); - -diff -upr linux-2.6.16.orig/arch/m32r/kernel/setup.c linux-2.6.16-026test009/arch/m32r/kernel/setup.c ---- linux-2.6.16.orig/arch/m32r/kernel/setup.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/m32r/kernel/setup.c 2006-04-19 15:02:11.000000000 +0400 -@@ -9,6 +9,7 @@ - - #include <linux/config.h> - #include <linux/init.h> -+#include <linux/kernel.h> - #include <linux/stddef.h> - #include <linux/fs.h> - #include <linux/sched.h> -@@ -218,8 +219,6 @@ static unsigned long __init setup_memory - extern unsigned long setup_memory(void); - #endif /* CONFIG_DISCONTIGMEM */ - --#define M32R_PCC_PCATCR 0x00ef7014 /* will move to m32r.h */ -- - void __init setup_arch(char **cmdline_p) - { - ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); -@@ -268,15 +267,14 @@ void __init setup_arch(char **cmdline_p) - paging_init(); - } - --static struct cpu cpu[NR_CPUS]; -+static struct cpu cpu_devices[NR_CPUS]; - - static int __init topology_init(void) - { -- int cpu_id; -+ int i; - -- 
for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) -- if (cpu_possible(cpu_id)) -- register_cpu(&cpu[cpu_id], cpu_id, NULL); -+ for_each_present_cpu(i) -+ register_cpu(&cpu_devices[i], i, NULL); - - return 0; - } -diff -upr linux-2.6.16.orig/arch/m32r/kernel/smpboot.c linux-2.6.16-026test009/arch/m32r/kernel/smpboot.c ---- linux-2.6.16.orig/arch/m32r/kernel/smpboot.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/m32r/kernel/smpboot.c 2006-04-19 15:02:11.000000000 +0400 -@@ -39,8 +39,10 @@ - * Martin J. Bligh : Added support for multi-quad systems - */ - -+#include <linux/module.h> - #include <linux/config.h> - #include <linux/init.h> -+#include <linux/kernel.h> - #include <linux/mm.h> - #include <linux/smp_lock.h> - #include <linux/irq.h> -@@ -72,11 +74,15 @@ physid_mask_t phys_cpu_present_map; - - /* Bitmask of currently online CPUs */ - cpumask_t cpu_online_map; -+EXPORT_SYMBOL(cpu_online_map); - - cpumask_t cpu_bootout_map; - cpumask_t cpu_bootin_map; --cpumask_t cpu_callout_map; - static cpumask_t cpu_callin_map; -+cpumask_t cpu_callout_map; -+EXPORT_SYMBOL(cpu_callout_map); -+cpumask_t cpu_possible_map = CPU_MASK_ALL; -+EXPORT_SYMBOL(cpu_possible_map); - - /* Per CPU bogomips and other parameters */ - struct cpuinfo_m32r cpu_data[NR_CPUS] __cacheline_aligned; -@@ -110,7 +116,6 @@ static unsigned int calibration_result; - - void smp_prepare_boot_cpu(void); - void smp_prepare_cpus(unsigned int); --static void smp_tune_scheduling(void); - static void init_ipi_lock(void); - static void do_boot_cpu(int); - int __cpu_up(unsigned int); -@@ -177,6 +182,9 @@ void __init smp_prepare_cpus(unsigned in - } - for (phys_id = 0 ; phys_id < nr_cpu ; phys_id++) - physid_set(phys_id, phys_cpu_present_map); -+#ifndef CONFIG_HOTPLUG_CPU -+ cpu_present_map = cpu_possible_map; -+#endif - - show_mp_info(nr_cpu); - -@@ -186,7 +194,6 @@ void __init smp_prepare_cpus(unsigned in - * Setup boot CPU information - */ - smp_store_cpu_info(0); /* Final full version of the 
data */ -- smp_tune_scheduling(); - - /* - * If SMP should be disabled, then really disable it! -@@ -230,11 +237,6 @@ smp_done: - Dprintk("Boot done.\n"); - } - --static void __init smp_tune_scheduling(void) --{ -- /* Nothing to do. */ --} -- - /* - * init_ipi_lock : Initialize IPI locks. - */ -@@ -629,4 +631,3 @@ static void __init unmap_cpu_to_physid(i - physid_2_cpu[phys_id] = -1; - cpu_2_physid[cpu_id] = -1; - } -- -diff -upr linux-2.6.16.orig/arch/m32r/lib/Makefile linux-2.6.16-026test009/arch/m32r/lib/Makefile ---- linux-2.6.16.orig/arch/m32r/lib/Makefile 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/m32r/lib/Makefile 2006-04-19 15:02:11.000000000 +0400 -@@ -2,6 +2,6 @@ - # Makefile for M32R-specific library files.. - # - --lib-y := checksum.o ashxdi3.o memset.o memcpy.o getuser.o \ -- putuser.o delay.o strlen.o usercopy.o csum_partial_copy.o -+lib-y := checksum.o ashxdi3.o memset.o memcpy.o \ -+ delay.o strlen.o usercopy.o csum_partial_copy.o - -diff -upr linux-2.6.16.orig/arch/mips/kernel/irixelf.c linux-2.6.16-026test009/arch/mips/kernel/irixelf.c ---- linux-2.6.16.orig/arch/mips/kernel/irixelf.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/mips/kernel/irixelf.c 2006-04-19 15:02:11.000000000 +0400 -@@ -432,7 +432,7 @@ static inline int look_for_irix_interpre - if (retval < 0) - goto out; - -- file = open_exec(*name); -+ file = open_exec(*name, bprm); - if (IS_ERR(file)) { - retval = PTR_ERR(file); - goto out; -diff -upr linux-2.6.16.orig/arch/mips/kernel/sysirix.c linux-2.6.16-026test009/arch/mips/kernel/sysirix.c ---- linux-2.6.16.orig/arch/mips/kernel/sysirix.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/mips/kernel/sysirix.c 2006-04-19 15:02:12.000000000 +0400 -@@ -110,7 +110,7 @@ asmlinkage int irix_prctl(unsigned optio - printk("irix_prctl[%s:%d]: Wants PR_ISBLOCKED\n", - current->comm, current->pid); - read_lock(&tasklist_lock); -- task = find_task_by_pid(va_arg(args, pid_t)); 
-+ task = find_task_by_pid_ve(va_arg(args, pid_t)); - error = -ESRCH; - if (error) - error = (task->run_list.next != NULL); -diff -upr linux-2.6.16.orig/arch/powerpc/Kconfig linux-2.6.16-026test009/arch/powerpc/Kconfig ---- linux-2.6.16.orig/arch/powerpc/Kconfig 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/Kconfig 2006-04-19 15:02:12.000000000 +0400 -@@ -956,6 +956,8 @@ source "arch/powerpc/platforms/iseries/K - - source "lib/Kconfig" - -+source "kernel/ub/Kconfig" -+ - menu "Instrumentation Support" - depends on EXPERIMENTAL - -@@ -974,6 +976,8 @@ endmenu - - source "arch/powerpc/Kconfig.debug" - -+source "kernel/Kconfig.openvz" -+ - source "security/Kconfig" - - config KEYS_COMPAT -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/irq.c linux-2.6.16-026test009/arch/powerpc/kernel/irq.c ---- linux-2.6.16.orig/arch/powerpc/kernel/irq.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/irq.c 2006-04-19 15:02:12.000000000 +0400 -@@ -50,6 +50,8 @@ - #include <linux/profile.h> - #include <linux/bitops.h> - -+#include <ub/beancounter.h> -+ - #include <asm/uaccess.h> - #include <asm/system.h> - #include <asm/io.h> -@@ -189,7 +191,11 @@ void do_IRQ(struct pt_regs *regs) - #ifdef CONFIG_IRQSTACKS - struct thread_info *curtp, *irqtp; - #endif -+ struct ve_struct *ve; -+ struct user_beancounter *ub; - -+ ve = set_exec_env(get_ve0()); -+ ub = set_exec_ub(get_ub0()); - irq_enter(); - - #ifdef CONFIG_DEBUG_STACKOVERFLOW -@@ -236,6 +242,8 @@ void do_IRQ(struct pt_regs *regs) - ppc_spurious_interrupts++; - - irq_exit(); -+ (void)set_exec_ub(ub); -+ (void)set_exec_env(ve); - - #ifdef CONFIG_PPC_ISERIES - if (get_lppaca()->int_dword.fields.decr_int) { -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/misc_32.S linux-2.6.16-026test009/arch/powerpc/kernel/misc_32.S ---- linux-2.6.16.orig/arch/powerpc/kernel/misc_32.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/misc_32.S 
2006-04-19 15:02:12.000000000 +0400 -@@ -973,7 +973,7 @@ _GLOBAL(_get_SP) - * Create a kernel thread - * kernel_thread(fn, arg, flags) - */ --_GLOBAL(kernel_thread) -+_GLOBAL(ppc_kernel_thread) - stwu r1,-16(r1) - stw r30,8(r1) - stw r31,12(r1) -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/misc_64.S linux-2.6.16-026test009/arch/powerpc/kernel/misc_64.S ---- linux-2.6.16.orig/arch/powerpc/kernel/misc_64.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/misc_64.S 2006-04-19 15:02:12.000000000 +0400 -@@ -677,7 +677,7 @@ _GLOBAL(scom970_write) - * Create a kernel thread - * kernel_thread(fn, arg, flags) - */ --_GLOBAL(kernel_thread) -+_GLOBAL(ppc_kernel_thread) - std r29,-24(r1) - std r30,-16(r1) - stdu r1,-STACK_FRAME_OVERHEAD(r1) -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/pci_64.c linux-2.6.16-026test009/arch/powerpc/kernel/pci_64.c ---- linux-2.6.16.orig/arch/powerpc/kernel/pci_64.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/pci_64.c 2006-04-19 15:02:11.000000000 +0400 -@@ -78,6 +78,7 @@ int global_phb_number; /* Global phb co - - /* Cached ISA bridge dev. 
*/ - struct pci_dev *ppc64_isabridge_dev = NULL; -+EXPORT_SYMBOL_GPL(ppc64_isabridge_dev); - - static void fixup_broken_pcnet32(struct pci_dev* dev) - { -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/process.c linux-2.6.16-026test009/arch/powerpc/kernel/process.c ---- linux-2.6.16.orig/arch/powerpc/kernel/process.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/process.c 2006-04-19 15:02:12.000000000 +0400 -@@ -889,6 +889,20 @@ void dump_stack(void) - } - EXPORT_SYMBOL(dump_stack); - -+long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -+{ -+ extern long ppc_kernel_thread(int (*fn)(void *), void *arg, -+ unsigned long flags); -+ -+ if (!ve_is_super(get_exec_env())) { -+ printk("kernel_thread call inside VE\n"); -+ dump_stack(); -+ return -EPERM; -+ } -+ -+ return ppc_kernel_thread(fn, arg, flags); -+} -+ - #ifdef CONFIG_PPC64 - void ppc64_runlatch_on(void) - { -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/setup_64.c linux-2.6.16-026test009/arch/powerpc/kernel/setup_64.c ---- linux-2.6.16.orig/arch/powerpc/kernel/setup_64.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/setup_64.c 2006-04-19 15:02:11.000000000 +0400 -@@ -256,12 +256,10 @@ void __init early_setup(unsigned long dt - /* - * Initialize stab / SLB management except on iSeries - */ -- if (!firmware_has_feature(FW_FEATURE_ISERIES)) { -- if (cpu_has_feature(CPU_FTR_SLB)) -- slb_initialize(); -- else -- stab_initialize(lpaca->stab_real); -- } -+ if (cpu_has_feature(CPU_FTR_SLB)) -+ slb_initialize(); -+ else if (!firmware_has_feature(FW_FEATURE_ISERIES)) -+ stab_initialize(lpaca->stab_real); - - DBG(" <- early_setup()\n"); - } -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/signal_64.c linux-2.6.16-026test009/arch/powerpc/kernel/signal_64.c ---- linux-2.6.16.orig/arch/powerpc/kernel/signal_64.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/signal_64.c 2006-04-19 
15:02:11.000000000 +0400 -@@ -213,7 +213,7 @@ static inline void __user * get_sigframe - /* Default to using normal stack */ - newsp = regs->gpr[1]; - -- if (ka->sa.sa_flags & SA_ONSTACK) { -+ if ((ka->sa.sa_flags & SA_ONSTACK) && current->sas_ss_size) { - if (! on_sig_stack(regs->gpr[1])) - newsp = (current->sas_ss_sp + current->sas_ss_size); - } -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/syscalls.c linux-2.6.16-026test009/arch/powerpc/kernel/syscalls.c ---- linux-2.6.16.orig/arch/powerpc/kernel/syscalls.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/syscalls.c 2006-04-19 15:02:12.000000000 +0400 -@@ -259,7 +259,7 @@ long ppc_newuname(struct new_utsname __u - int err = 0; - - down_read(&uts_sem); -- if (copy_to_user(name, &system_utsname, sizeof(*name))) -+ if (copy_to_user(name, &ve_utsname, sizeof(*name))) - err = -EFAULT; - up_read(&uts_sem); - if (!err) -@@ -272,7 +272,7 @@ int sys_uname(struct old_utsname __user - int err = 0; - - down_read(&uts_sem); -- if (copy_to_user(name, &system_utsname, sizeof(*name))) -+ if (copy_to_user(name, &ve_utsname, sizeof(*name))) - err = -EFAULT; - up_read(&uts_sem); - if (!err) -@@ -288,19 +288,19 @@ int sys_olduname(struct oldold_utsname _ - return -EFAULT; - - down_read(&uts_sem); -- error = __copy_to_user(&name->sysname, &system_utsname.sysname, -+ error = __copy_to_user(&name->sysname, &ve_utsname.sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); -- error |= __copy_to_user(&name->nodename, &system_utsname.nodename, -+ error |= __copy_to_user(&name->nodename, &ve_utsname.nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); -- error |= __copy_to_user(&name->release, &system_utsname.release, -+ error |= __copy_to_user(&name->release, &ve_utsname.release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); -- error |= __copy_to_user(&name->version, &system_utsname.version, -+ error |= 
__copy_to_user(&name->version, &ve_utsname.version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); -- error |= __copy_to_user(&name->machine, &system_utsname.machine, -+ error |= __copy_to_user(&name->machine, &ve_utsname.machine, - __OLD_UTS_LEN); - error |= override_machine(name->machine); - up_read(&uts_sem); -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/systbl.S linux-2.6.16-026test009/arch/powerpc/kernel/systbl.S ---- linux-2.6.16.orig/arch/powerpc/kernel/systbl.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/systbl.S 2006-04-19 15:02:11.000000000 +0400 -@@ -322,3 +322,12 @@ SYSCALL(spu_create) - COMPAT_SYS(pselect6) - COMPAT_SYS(ppoll) - SYSCALL(unshare) -+ -+.rept 410 - (. - sys_call_table)/8 -+SYSX(sys_ni_syscall, sys_ni_syscall, sys_ni_syscall) -+.endr -+ -+SYSX(sys_getluid, sys_ni_syscall, sys_getluid) -+SYSX(sys_setluid, sys_ni_syscall, sys_setluid) -+SYSX(sys_setublimit, sys_ni_syscall, sys_setublimit) -+SYSX(sys_ubstat, sys_ni_syscall, sys_ubstat) -diff -upr linux-2.6.16.orig/arch/powerpc/kernel/time.c linux-2.6.16-026test009/arch/powerpc/kernel/time.c ---- linux-2.6.16.orig/arch/powerpc/kernel/time.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/kernel/time.c 2006-04-19 15:02:12.000000000 +0400 -@@ -431,12 +431,14 @@ void timer_interrupt(struct pt_regs * re - int next_dec; - int cpu = smp_processor_id(); - unsigned long ticks; -+ struct ve_struct *ve; - - #ifdef CONFIG_PPC32 - if (atomic_read(&ppc_n_lost_interrupts) != 0) - do_IRQ(regs); - #endif - -+ ve = set_exec_env(get_ve0()); - irq_enter(); - - profile_tick(CPU_PROFILING, regs); -@@ -496,6 +498,7 @@ void timer_interrupt(struct pt_regs * re - #endif - - irq_exit(); -+ (void)set_exec_env(ve); - } - - void wakeup_decrementer(void) -diff -upr linux-2.6.16.orig/arch/powerpc/mm/fault.c linux-2.6.16-026test009/arch/powerpc/mm/fault.c ---- linux-2.6.16.orig/arch/powerpc/mm/fault.c 2006-04-19 
15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/mm/fault.c 2006-04-19 15:02:12.000000000 +0400 -@@ -307,7 +307,6 @@ good_area: - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ -- survive: - switch (handle_mm_fault(mm, vma, address, is_write)) { - - case VM_FAULT_MINOR: -@@ -351,14 +350,12 @@ bad_area_nosemaphore: - */ - out_of_memory: - up_read(&mm->mmap_sem); -- if (current->pid == 1) { -- yield(); -- down_read(&mm->mmap_sem); -- goto survive; -- } -- printk("VM: killing process %s\n", current->comm); - if (user_mode(regs)) -- do_exit(SIGKILL); -+ /* -+ * 0-order allocation always success if something really -+ * fatal not happen: beancounter overdraft or OOM. Den -+ */ -+ force_sig(SIGKILL, current); - return SIGKILL; - - do_sigbus: -diff -upr linux-2.6.16.orig/arch/powerpc/mm/init_64.c linux-2.6.16-026test009/arch/powerpc/mm/init_64.c ---- linux-2.6.16.orig/arch/powerpc/mm/init_64.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/mm/init_64.c 2006-04-19 15:02:11.000000000 +0400 -@@ -225,7 +225,8 @@ void pgtable_cache_init(void) - pgtable_cache[i] = kmem_cache_create(name, - size, size, - SLAB_HWCACHE_ALIGN | -- SLAB_MUST_HWCACHE_ALIGN, -+ SLAB_MUST_HWCACHE_ALIGN | -+ SLAB_UBC | SLAB_NO_CHARGE, - zero_ctor, - NULL); - if (! 
pgtable_cache[i]) -diff -upr linux-2.6.16.orig/arch/powerpc/mm/mem.c linux-2.6.16-026test009/arch/powerpc/mm/mem.c ---- linux-2.6.16.orig/arch/powerpc/mm/mem.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/mm/mem.c 2006-04-19 15:02:12.000000000 +0400 -@@ -222,6 +222,7 @@ void show_mem(void) - printk("%ld pages shared\n", shared); - printk("%ld pages swap cached\n", cached); - } -+EXPORT_SYMBOL(show_mem); - - /* - * Initialize the bootmem system and give it all the memory we -diff -upr linux-2.6.16.orig/arch/powerpc/mm/pgtable_32.c linux-2.6.16-026test009/arch/powerpc/mm/pgtable_32.c ---- linux-2.6.16.orig/arch/powerpc/mm/pgtable_32.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/powerpc/mm/pgtable_32.c 2006-04-19 15:02:11.000000000 +0400 -@@ -85,7 +85,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) - { - pgd_t *ret; - -- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); -+ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | -+ __GFP_ZERO, PGDIR_ORDER); - return ret; - } - -@@ -119,6 +120,7 @@ struct page *pte_alloc_one(struct mm_str - #else - gfp_t flags = GFP_KERNEL | __GFP_REPEAT; - #endif -+ flags |= (__GFP_UBC | __GFP_SOFT_UBC); - - ptepage = alloc_pages(flags, 0); - if (ptepage) -diff -upr linux-2.6.16.orig/arch/ppc/Kconfig linux-2.6.16-026test009/arch/ppc/Kconfig ---- linux-2.6.16.orig/arch/ppc/Kconfig 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ppc/Kconfig 2006-04-19 15:02:12.000000000 +0400 -@@ -1394,6 +1394,10 @@ source "arch/powerpc/oprofile/Kconfig" - - source "arch/ppc/Kconfig.debug" - -+source "kernel/Kconfig.openvz" -+ - source "security/Kconfig" - -+source "kernel/ub/Kconfig" -+ - source "crypto/Kconfig" -diff -upr linux-2.6.16.orig/arch/ppc/kernel/misc.S linux-2.6.16-026test009/arch/ppc/kernel/misc.S ---- linux-2.6.16.orig/arch/ppc/kernel/misc.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ppc/kernel/misc.S 
2006-04-19 15:02:12.000000000 +0400 -@@ -1004,7 +1004,7 @@ _GLOBAL(_get_SP) - * Create a kernel thread - * kernel_thread(fn, arg, flags) - */ --_GLOBAL(kernel_thread) -+_GLOBAL(ppc_kernel_thread) - stwu r1,-16(r1) - stw r30,8(r1) - stw r31,12(r1) -diff -upr linux-2.6.16.orig/arch/ppc/kernel/time.c linux-2.6.16-026test009/arch/ppc/kernel/time.c ---- linux-2.6.16.orig/arch/ppc/kernel/time.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ppc/kernel/time.c 2006-04-19 15:02:12.000000000 +0400 -@@ -58,6 +58,8 @@ - #include <linux/init.h> - #include <linux/profile.h> - -+#include <ub/beancounter.h> -+ - #include <asm/io.h> - #include <asm/nvram.h> - #include <asm/cache.h> -@@ -136,10 +138,14 @@ void timer_interrupt(struct pt_regs * re - unsigned long cpu = smp_processor_id(); - unsigned jiffy_stamp = last_jiffy_stamp(cpu); - extern void do_IRQ(struct pt_regs *); -+ struct ve_struct *ve; -+ struct user_beancounter *ub; - - if (atomic_read(&ppc_n_lost_interrupts) != 0) - do_IRQ(regs); - -+ ve = set_exec_env(get_ve0()); -+ ub = set_exec_ub(get_ub0()); - irq_enter(); - - while ((next_dec = tb_ticks_per_jiffy - tb_delta(&jiffy_stamp)) <= 0) { -@@ -192,6 +198,8 @@ void timer_interrupt(struct pt_regs * re - ppc_md.heartbeat(); - - irq_exit(); -+ (void)set_exec_ub(ub); -+ (void)set_exec_env(ve); - } - - /* -diff -upr linux-2.6.16.orig/arch/ppc/mm/fault.c linux-2.6.16-026test009/arch/ppc/mm/fault.c ---- linux-2.6.16.orig/arch/ppc/mm/fault.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ppc/mm/fault.c 2006-04-19 15:02:12.000000000 +0400 -@@ -247,7 +247,6 @@ good_area: - * make sure we exit gracefully rather than endlessly redo - * the fault. 
- */ -- survive: - switch (handle_mm_fault(mm, vma, address, is_write)) { - case VM_FAULT_MINOR: - current->min_flt++; -@@ -290,14 +289,12 @@ bad_area: - */ - out_of_memory: - up_read(&mm->mmap_sem); -- if (current->pid == 1) { -- yield(); -- down_read(&mm->mmap_sem); -- goto survive; -- } -- printk("VM: killing process %s\n", current->comm); - if (user_mode(regs)) -- do_exit(SIGKILL); -+ /* -+ * 0-order allocation always success if something really -+ * fatal not happen: beancounter overdraft or OOM. Den -+ */ -+ force_sig(SIGKILL, current); - return SIGKILL; - - do_sigbus: -diff -upr linux-2.6.16.orig/arch/ppc/mm/init.c linux-2.6.16-026test009/arch/ppc/mm/init.c ---- linux-2.6.16.orig/arch/ppc/mm/init.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ppc/mm/init.c 2006-04-19 15:02:12.000000000 +0400 -@@ -132,6 +132,7 @@ void show_mem(void) - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); - } -+EXPORT_SYMBOL(show_mem); - - /* Free up now-unused memory */ - static void free_sec(unsigned long start, unsigned long end, const char *name) -diff -upr linux-2.6.16.orig/arch/ppc/mm/pgtable.c linux-2.6.16-026test009/arch/ppc/mm/pgtable.c ---- linux-2.6.16.orig/arch/ppc/mm/pgtable.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/ppc/mm/pgtable.c 2006-04-19 15:02:11.000000000 +0400 -@@ -84,7 +84,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) - { - pgd_t *ret; - -- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); -+ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | -+ __GFP_ZERO, PGDIR_ORDER); - return ret; - } - -@@ -118,6 +119,7 @@ struct page *pte_alloc_one(struct mm_str - #else - gfp_t flags = GFP_KERNEL | __GFP_REPEAT; - #endif -+ flags |= (__GFP_UBC | __GFP_SOFT_UBC); - - ptepage = alloc_pages(flags, 0); - if (ptepage) -diff -upr linux-2.6.16.orig/arch/s390/Kconfig linux-2.6.16-026test009/arch/s390/Kconfig ---- linux-2.6.16.orig/arch/s390/Kconfig 2006-04-19 
15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/s390/Kconfig 2006-04-19 15:02:12.000000000 +0400 -@@ -472,8 +472,12 @@ source "arch/s390/oprofile/Kconfig" - - source "arch/s390/Kconfig.debug" - -+source "kernel/Kconfig.openvz" -+ - source "security/Kconfig" - - source "crypto/Kconfig" - - source "lib/Kconfig" -+ -+source "kernel/ub/Kconfig" -diff -upr linux-2.6.16.orig/arch/s390/kernel/process.c linux-2.6.16-026test009/arch/s390/kernel/process.c ---- linux-2.6.16.orig/arch/s390/kernel/process.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/s390/kernel/process.c 2006-04-19 15:02:12.000000000 +0400 -@@ -164,9 +164,10 @@ void show_regs(struct pt_regs *regs) - struct task_struct *tsk = current; - - printk("CPU: %d %s\n", task_thread_info(tsk)->cpu, print_tainted()); -- printk("Process %s (pid: %d, task: %p, ksp: %p)\n", -- current->comm, current->pid, (void *) tsk, -- (void *) tsk->thread.ksp); -+ printk("Process %s (pid: %d, veid: %d, task: %p, ksp: %p)\n", -+ current->comm, current->pid, -+ VEID(VE_TASK_INFO(current)->owner_env), -+ (void *) tsk, (void *) tsk->thread.ksp); - - show_registers(regs); - /* Show stack backtrace if pt_regs is from kernel mode */ -@@ -187,6 +188,13 @@ int kernel_thread(int (*fn)(void *), voi - { - struct pt_regs regs; - -+ if (!ve_is_super(get_exec_env())) { -+ /* Don't allow kernel_thread() inside VE */ -+ printk("kernel_thread call inside VE\n"); -+ dump_stack(); -+ return -EPERM; -+ } -+ - memset(®s, 0, sizeof(regs)); - regs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | PSW_MASK_EXT; - regs.psw.addr = (unsigned long) kernel_thread_starter | PSW_ADDR_AMODE; -diff -upr linux-2.6.16.orig/arch/s390/kernel/s390_ext.c linux-2.6.16-026test009/arch/s390/kernel/s390_ext.c ---- linux-2.6.16.orig/arch/s390/kernel/s390_ext.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/s390/kernel/s390_ext.c 2006-04-19 15:02:12.000000000 +0400 -@@ -114,7 +114,9 @@ void do_extint(struct pt_regs *regs, uns - { 
- ext_int_info_t *p; - int index; -+ struct ve_struct *envid; - -+ envid = set_exec_env(get_ve0()); - irq_enter(); - asm volatile ("mc 0,0"); - if (S390_lowcore.int_clock >= S390_lowcore.jiffy_timer) -@@ -132,6 +134,7 @@ void do_extint(struct pt_regs *regs, uns - } - } - irq_exit(); -+ (void)set_exec_env(envid); - } - - EXPORT_SYMBOL(register_external_interrupt); -diff -upr linux-2.6.16.orig/arch/s390/kernel/smp.c linux-2.6.16-026test009/arch/s390/kernel/smp.c ---- linux-2.6.16.orig/arch/s390/kernel/smp.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/s390/kernel/smp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -526,6 +526,17 @@ int __devinit start_secondary(void *cpuv - { - /* Setup the cpu */ - cpu_init(); -+ -+#ifdef CONFIG_VE -+ /* TSC reset. kill whatever might rely on old values */ -+ VE_TASK_INFO(current)->wakeup_stamp = 0; -+ /* -+ * Cosmetic: sleep_time won't be changed afterwards for the idle -+ * thread; keep it 0 rather than -cycles. -+ */ -+ VE_TASK_INFO(idle)->sleep_time = 0; -+#endif -+ - preempt_disable(); - /* init per CPU timer */ - init_cpu_timer(); -@@ -834,6 +845,11 @@ void __init smp_prepare_cpus(unsigned in - for_each_cpu(cpu) - if (cpu != smp_processor_id()) - smp_create_idle(cpu); -+ -+#ifdef CONFIG_VE -+ /* TSC reset. 
kill whatever might rely on old values */ -+ VE_TASK_INFO(current)->wakeup_stamp = 0; -+#endif - } - - void __devinit smp_prepare_boot_cpu(void) -diff -upr linux-2.6.16.orig/arch/s390/kernel/syscalls.S linux-2.6.16-026test009/arch/s390/kernel/syscalls.S ---- linux-2.6.16.orig/arch/s390/kernel/syscalls.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/s390/kernel/syscalls.S 2006-04-19 15:02:11.000000000 +0400 -@@ -312,3 +312,12 @@ SYSCALL(sys_faccessat,sys_faccessat,sys_ - SYSCALL(sys_pselect6,sys_pselect6,compat_sys_pselect6_wrapper) - SYSCALL(sys_ppoll,sys_ppoll,compat_sys_ppoll_wrapper) - SYSCALL(sys_unshare,sys_unshare,sys_unshare_wrapper) -+ -+.rept 410-(.-sys_call_table)/4 -+ NI_SYSCALL -+.endr -+ -+SYSCALL(sys_getluid, sys_getluid, sys_ni_syscall) /* 410 */ -+SYSCALL(sys_setluid, sys_setluid, sys_ni_syscall) -+SYSCALL(sys_setublimit, sys_setublimit, sys_ni_syscall) -+SYSCALL(sys_ubstat, sys_ubstat, sys_ni_syscall) -diff -upr linux-2.6.16.orig/arch/s390/mm/fault.c linux-2.6.16-026test009/arch/s390/mm/fault.c ---- linux-2.6.16.orig/arch/s390/mm/fault.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/s390/mm/fault.c 2006-04-19 15:02:11.000000000 +0400 -@@ -61,17 +61,9 @@ void bust_spinlocks(int yes) - if (yes) { - oops_in_progress = 1; - } else { -- int loglevel_save = console_loglevel; - console_unblank(); - oops_in_progress = 0; -- /* -- * OK, the message is on the console. Now we call printk() -- * without oops_in_progress set so that printk will give klogd -- * a poke. Hold onto your hats... 
-- */ -- console_loglevel = 15; -- printk(" "); -- console_loglevel = loglevel_save; -+ wake_up_klogd(); - } - } - -diff -upr linux-2.6.16.orig/arch/s390/mm/init.c linux-2.6.16-026test009/arch/s390/mm/init.c ---- linux-2.6.16.orig/arch/s390/mm/init.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/s390/mm/init.c 2006-04-19 15:02:12.000000000 +0400 -@@ -89,6 +89,7 @@ void show_mem(void) - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); - } -+EXPORT_SYMBOL(show_mem); - - /* References to section boundaries */ - -diff -upr linux-2.6.16.orig/arch/sh/kernel/kgdb_stub.c linux-2.6.16-026test009/arch/sh/kernel/kgdb_stub.c ---- linux-2.6.16.orig/arch/sh/kernel/kgdb_stub.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/sh/kernel/kgdb_stub.c 2006-04-19 15:02:12.000000000 +0400 -@@ -412,7 +412,7 @@ static struct task_struct *get_thread(in - if (pid == PID_MAX) pid = 0; - - /* First check via PID */ -- thread = find_task_by_pid(pid); -+ thread = find_task_by_pid_all(pid); - - if (thread) - return thread; -diff -upr linux-2.6.16.orig/arch/sh64/kernel/process.c linux-2.6.16-026test009/arch/sh64/kernel/process.c ---- linux-2.6.16.orig/arch/sh64/kernel/process.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/sh64/kernel/process.c 2006-04-19 15:02:12.000000000 +0400 -@@ -906,7 +906,7 @@ asids_proc_info(char *buf, char **start, - int len=0; - struct task_struct *p; - read_lock(&tasklist_lock); -- for_each_process(p) { -+ for_each_process_ve(p) { - int pid = p->pid; - struct mm_struct *mm; - if (!pid) continue; -diff -upr linux-2.6.16.orig/arch/sparc64/kernel/setup.c linux-2.6.16-026test009/arch/sparc64/kernel/setup.c ---- linux-2.6.16.orig/arch/sparc64/kernel/setup.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/sparc64/kernel/setup.c 2006-04-19 15:02:12.000000000 +0400 -@@ -156,7 +156,7 @@ int prom_callback(long *args) - pte_t *ptep; - pte_t pte; - -- 
for_each_process(p) { -+ for_each_process_all(p) { - mm = p->mm; - if (CTX_NRBITS(mm->context) == ctx) - break; -diff -upr linux-2.6.16.orig/arch/um/drivers/mconsole_kern.c linux-2.6.16-026test009/arch/um/drivers/mconsole_kern.c ---- linux-2.6.16.orig/arch/um/drivers/mconsole_kern.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/um/drivers/mconsole_kern.c 2006-04-19 15:02:12.000000000 +0400 -@@ -600,7 +600,7 @@ static void do_stack_trace(struct mc_req - - from = current; - -- to = find_task_by_pid(pid_requested); -+ to = find_task_by_pid_all(pid_requested); - if((to == NULL) || (pid_requested == 0)) { - mconsole_reply(req, "Couldn't find that pid", 1, 0); - return; -diff -upr linux-2.6.16.orig/arch/um/kernel/skas/process_kern.c linux-2.6.16-026test009/arch/um/kernel/skas/process_kern.c ---- linux-2.6.16.orig/arch/um/kernel/skas/process_kern.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/um/kernel/skas/process_kern.c 2006-04-19 15:02:12.000000000 +0400 -@@ -197,7 +197,7 @@ void kill_off_processes_skas(void) - int pid, me; - - me = os_getpid(); -- for_each_process(p){ -+ for_each_process_all(p){ - if(p->mm == NULL) - continue; - -diff -upr linux-2.6.16.orig/arch/um/kernel/tt/process_kern.c linux-2.6.16-026test009/arch/um/kernel/tt/process_kern.c ---- linux-2.6.16.orig/arch/um/kernel/tt/process_kern.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/um/kernel/tt/process_kern.c 2006-04-19 15:02:12.000000000 +0400 -@@ -301,7 +301,7 @@ void kill_off_processes_tt(void) - int me; - - me = os_getpid(); -- for_each_process(p){ -+ for_each_process_all(p){ - if(p->thread.mode.tt.extern_pid != me) - os_kill_process(p->thread.mode.tt.extern_pid, 0); - } -@@ -444,7 +444,7 @@ int is_valid_pid(int pid) - struct task_struct *task; - - read_lock(&tasklist_lock); -- for_each_process(task){ -+ for_each_process_all(task){ - if(task->thread.mode.tt.extern_pid == pid){ - read_unlock(&tasklist_lock); - return(1); -diff 
-upr linux-2.6.16.orig/arch/x86_64/Kconfig linux-2.6.16-026test009/arch/x86_64/Kconfig ---- linux-2.6.16.orig/arch/x86_64/Kconfig 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/Kconfig 2006-04-19 15:02:12.000000000 +0400 -@@ -588,8 +588,12 @@ endmenu - - source "arch/x86_64/Kconfig.debug" - -+source "kernel/Kconfig.openvz" -+ - source "security/Kconfig" - - source "crypto/Kconfig" - - source "lib/Kconfig" -+ -+source "kernel/ub/Kconfig" -diff -upr linux-2.6.16.orig/arch/x86_64/boot/compressed/head.S linux-2.6.16-026test009/arch/x86_64/boot/compressed/head.S ---- linux-2.6.16.orig/arch/x86_64/boot/compressed/head.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/boot/compressed/head.S 2006-04-19 15:02:12.000000000 +0400 -@@ -34,7 +34,7 @@ - startup_32: - cld - cli -- movl $(__KERNEL_DS),%eax -+ movl $(__BOOT_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs -@@ -76,7 +76,7 @@ startup_32: - jnz 3f - addl $8,%esp - xorl %ebx,%ebx -- ljmp $(__KERNEL_CS), $__PHYSICAL_START -+ ljmp $(__BOOT_CS), $__PHYSICAL_START - - /* - * We come here, if we were loaded high. 
-@@ -104,7 +104,7 @@ startup_32: - popl %eax # hcount - movl $__PHYSICAL_START,%edi - cli # make sure we don't get interrupted -- ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine -+ ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine - - /* - * Routine (template) for moving the decompressed kernel in place, -@@ -127,7 +127,7 @@ move_routine_start: - movsl - movl %ebx,%esi # Restore setup pointer - xorl %ebx,%ebx -- ljmp $(__KERNEL_CS), $__PHYSICAL_START -+ ljmp $(__BOOT_CS), $__PHYSICAL_START - move_routine_end: - - -@@ -137,5 +137,5 @@ user_stack: - .fill 4096,4,0 - stack_start: - .long user_stack+4096 -- .word __KERNEL_DS -+ .word __BOOT_DS - -diff -upr linux-2.6.16.orig/arch/x86_64/boot/setup.S linux-2.6.16-026test009/arch/x86_64/boot/setup.S ---- linux-2.6.16.orig/arch/x86_64/boot/setup.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/boot/setup.S 2006-04-19 15:02:12.000000000 +0400 -@@ -729,7 +729,7 @@ flush_instr: - subw $DELTA_INITSEG, %si - shll $4, %esi # Convert to 32-bit pointer - # NOTE: For high loaded big kernels we need a --# jmpi 0x100000,__KERNEL_CS -+# jmpi 0x100000,__BOOT_CS - # - # but we yet haven't reloaded the CS register, so the default size - # of the target offset still is 16 bit. -@@ -740,7 +740,7 @@ flush_instr: - .byte 0x66, 0xea # prefix + jmpi-opcode - code32: .long 0x1000 # will be set to 0x100000 - # for big kernels -- .word __KERNEL_CS -+ .word __BOOT_CS - - # Here's a bunch of information about your current kernel.. 
- kernel_version: .ascii UTS_RELEASE -diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_aout.c linux-2.6.16-026test009/arch/x86_64/ia32/ia32_aout.c ---- linux-2.6.16.orig/arch/x86_64/ia32/ia32_aout.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/ia32/ia32_aout.c 2006-04-19 15:02:12.000000000 +0400 -@@ -347,14 +347,14 @@ static int load_aout_binary(struct linux - if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) - { -- printk(KERN_NOTICE "executable not page aligned\n"); -+ ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n"); - error_time2 = jiffies; - } - - if ((fd_offset & ~PAGE_MASK) != 0 && - (jiffies-error_time) > 5*HZ) - { -- printk(KERN_WARNING -+ ve_printk(VE_LOG, KERN_WARNING - "fd_offset is not page aligned. Please convert program: %s\n", - bprm->file->f_dentry->d_name.name); - error_time = jiffies; -@@ -467,7 +467,7 @@ static int load_aout_library(struct file - static unsigned long error_time; - if ((jiffies-error_time) > 5*HZ) - { -- printk(KERN_WARNING -+ ve_printk(VE_LOG, KERN_WARNING - "N_TXTOFF is not page aligned. 
Please convert library: %s\n", - file->f_dentry->d_name.name); - error_time = jiffies; -diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_binfmt.c linux-2.6.16-026test009/arch/x86_64/ia32/ia32_binfmt.c ---- linux-2.6.16.orig/arch/x86_64/ia32/ia32_binfmt.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/ia32/ia32_binfmt.c 2006-04-19 15:02:12.000000000 +0400 -@@ -27,12 +27,14 @@ - #include <asm/ia32.h> - #include <asm/vsyscall32.h> - -+#include <ub/ub_vmpages.h> -+ - #define ELF_NAME "elf/i386" - - #define AT_SYSINFO 32 - #define AT_SYSINFO_EHDR 33 - --int sysctl_vsyscall32 = 1; -+int sysctl_vsyscall32 = 0; - - #define ARCH_DLINFO do { \ - if (sysctl_vsyscall32) { \ -@@ -347,9 +349,15 @@ int ia32_setup_arg_pages(struct linux_bi - bprm->loader += stack_base; - bprm->exec += stack_base; - -+ ret = -ENOMEM; -+ if (ub_memory_charge(mm, IA32_STACK_TOP - -+ (PAGE_MASK & (unsigned long)bprm->p), -+ VM_STACK_FLAGS, NULL, UB_SOFT)) -+ goto err_charge; -+ - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!mpnt) -- return -ENOMEM; -+ goto err_alloc; - - memset(mpnt, 0, sizeof(*mpnt)); - -@@ -366,11 +374,8 @@ int ia32_setup_arg_pages(struct linux_bi - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
- PAGE_COPY_EXEC : PAGE_COPY; -- if ((ret = insert_vm_struct(mm, mpnt))) { -- up_write(&mm->mmap_sem); -- kmem_cache_free(vm_area_cachep, mpnt); -- return ret; -- } -+ if ((ret = insert_vm_struct(mm, mpnt))) -+ goto err_insert; - mm->stack_vm = mm->total_vm = vma_pages(mpnt); - } - -@@ -385,6 +390,16 @@ int ia32_setup_arg_pages(struct linux_bi - up_write(&mm->mmap_sem); - - return 0; -+ -+err_insert: -+ up_write(&mm->mmap_sem); -+ kmem_cache_free(vm_area_cachep, mpnt); -+err_alloc: -+ ub_memory_uncharge(mm, IA32_STACK_TOP - -+ (PAGE_MASK & (unsigned long)bprm->p), -+ VM_STACK_FLAGS, NULL); -+err_charge: -+ return ret; - } - EXPORT_SYMBOL(ia32_setup_arg_pages); - -diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_signal.c linux-2.6.16-026test009/arch/x86_64/ia32/ia32_signal.c ---- linux-2.6.16.orig/arch/x86_64/ia32/ia32_signal.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/ia32/ia32_signal.c 2006-04-19 15:02:12.000000000 +0400 -@@ -39,7 +39,6 @@ - - #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - --asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); - void signal_fault(struct pt_regs *regs, void __user *frame, char *where); - - int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) -@@ -118,22 +117,17 @@ asmlinkage long - sys32_sigsuspend(int history0, int history1, old_sigset_t mask, - struct pt_regs *regs) - { -- sigset_t saveset; -- - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); -- saveset = current->blocked; -+ current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - -- regs->rax = -EINTR; -- while (1) { -- current->state = TASK_INTERRUPTIBLE; -- schedule(); -- if (do_signal(regs, &saveset)) -- return -EINTR; -- } -+ current->state = TASK_INTERRUPTIBLE; -+ schedule(); -+ set_thread_flag(TIF_RESTORE_SIGMASK); -+ return -ERESTARTNOHAND; - } - - asmlinkage long -@@ -510,11 +504,11 @@ int 
ia32_setup_frame(int sig, struct k_s - current->comm, current->pid, frame, regs->rip, frame->pretcode); - #endif - -- return 1; -+ return 0; - - give_sigsegv: - force_sigsegv(sig, current); -- return 0; -+ return -EFAULT; - } - - int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, -@@ -606,9 +600,9 @@ int ia32_setup_rt_frame(int sig, struct - current->comm, current->pid, frame, regs->rip, frame->pretcode); - #endif - -- return 1; -+ return 0; - - give_sigsegv: - force_sigsegv(sig, current); -- return 0; -+ return -EFAULT; - } -diff -upr linux-2.6.16.orig/arch/x86_64/ia32/sys_ia32.c linux-2.6.16-026test009/arch/x86_64/ia32/sys_ia32.c ---- linux-2.6.16.orig/arch/x86_64/ia32/sys_ia32.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/ia32/sys_ia32.c 2006-04-19 15:02:12.000000000 +0400 -@@ -527,7 +527,7 @@ int sys32_ni_syscall(int call) - static char lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { -- printk(KERN_INFO "IA32 syscall %d from %s not implemented\n", -+ ve_printk(VE_LOG, KERN_INFO "IA32 syscall %d from %s not implemented\n", - call, me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } -@@ -890,13 +890,13 @@ asmlinkage long sys32_olduname(struct ol - - down_read(&uts_sem); - -- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); -+ error = __copy_to_user(&name->sysname,&ve_utsname.sysname,__OLD_UTS_LEN); - __put_user(0,name->sysname+__OLD_UTS_LEN); -- __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); -+ __copy_to_user(&name->nodename,&ve_utsname.nodename,__OLD_UTS_LEN); - __put_user(0,name->nodename+__OLD_UTS_LEN); -- __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); -+ __copy_to_user(&name->release,&ve_utsname.release,__OLD_UTS_LEN); - __put_user(0,name->release+__OLD_UTS_LEN); -- __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); -+ 
__copy_to_user(&name->version,&ve_utsname.version,__OLD_UTS_LEN); - __put_user(0,name->version+__OLD_UTS_LEN); - { - char *arch = "x86_64"; -@@ -919,7 +919,7 @@ long sys32_uname(struct old_utsname __us - if (!name) - return -EFAULT; - down_read(&uts_sem); -- err=copy_to_user(name, &system_utsname, sizeof (*name)); -+ err=copy_to_user(name, &ve_utsname, sizeof (*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); -@@ -1005,7 +1005,7 @@ long sys32_vm86_warning(void) - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { -- printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", -+ ve_printk(VE_LOG, KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } -diff -upr linux-2.6.16.orig/arch/x86_64/ia32/syscall32.c linux-2.6.16-026test009/arch/x86_64/ia32/syscall32.c ---- linux-2.6.16.orig/arch/x86_64/ia32/syscall32.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/ia32/syscall32.c 2006-04-19 15:02:11.000000000 +0400 -@@ -14,6 +14,8 @@ - #include <asm/tlbflush.h> - #include <asm/ia32_unistd.h> - -+#include <ub/ub_vmpages.h> -+ - extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; - extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; - extern int sysctl_vsyscall32; -@@ -47,32 +49,45 @@ int syscall32_setup_pages(struct linux_b - int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT; - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; -+ unsigned long flags; - int ret; - -+ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | -+ mm->def_flags; -+ -+ ret = -ENOMEM; -+ if (ub_memory_charge(mm, VSYSCALL32_END - VSYSCALL32_BASE, -+ flags, NULL, UB_SOFT)) -+ goto err_charge; -+ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!vma) 
-- return -ENOMEM; -+ goto err_alloc; - - memset(vma, 0, sizeof(struct vm_area_struct)); - /* Could randomize here */ - vma->vm_start = VSYSCALL32_BASE; - vma->vm_end = VSYSCALL32_END; - /* MAYWRITE to allow gdb to COW and set breakpoints */ -- vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE; -- vma->vm_flags |= mm->def_flags; -+ vma->vm_flags = flags; - vma->vm_page_prot = protection_map[vma->vm_flags & 7]; - vma->vm_ops = &syscall32_vm_ops; - vma->vm_mm = mm; - - down_write(&mm->mmap_sem); -- if ((ret = insert_vm_struct(mm, vma))) { -- up_write(&mm->mmap_sem); -- kmem_cache_free(vm_area_cachep, vma); -- return ret; -- } -+ if ((ret = insert_vm_struct(mm, vma))) -+ goto err_ins; - mm->total_vm += npages; - up_write(&mm->mmap_sem); - return 0; -+ -+err_ins: -+ up_write(&mm->mmap_sem); -+ kmem_cache_free(vm_area_cachep, vma); -+err_alloc: -+ ub_memory_uncharge(mm, VSYSCALL32_END - VSYSCALL32_BASE, flags, NULL); -+err_charge: -+ return ret; - } - - static int __init init_syscall32(void) -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/acpi/wakeup.S linux-2.6.16-026test009/arch/x86_64/kernel/acpi/wakeup.S ---- linux-2.6.16.orig/arch/x86_64/kernel/acpi/wakeup.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/acpi/wakeup.S 2006-04-19 15:02:12.000000000 +0400 -@@ -77,7 +77,7 @@ wakeup_code: - - .byte 0x66, 0xea # prefix + jmpi-opcode - .long wakeup_32 - __START_KERNEL_map -- .word __KERNEL_CS -+ .word __BOOT_CS - - .code32 - wakeup_32: -@@ -96,13 +96,13 @@ wakeup_32: - jnc bogus_cpu - movl %edx,%edi - -- movw $__KERNEL_DS, %ax -+ movw $__BOOT_DS, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs - -- movw $__KERNEL_DS, %ax -+ movw $__BOOT_DS, %ax - movw %ax, %ss - - mov $(wakeup_stack - __START_KERNEL_map), %esp -@@ -187,7 +187,7 @@ reach_compatibility_mode: - - wakeup_jumpvector: - .long wakeup_long64 - __START_KERNEL_map -- .word __KERNEL_CS -+ .word __BOOT_CS - - .code64 - -diff -upr 
linux-2.6.16.orig/arch/x86_64/kernel/apic.c linux-2.6.16-026test009/arch/x86_64/kernel/apic.c ---- linux-2.6.16.orig/arch/x86_64/kernel/apic.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/apic.c 2006-04-19 15:02:12.000000000 +0400 -@@ -941,6 +941,7 @@ void smp_local_timer_interrupt(struct pt - */ - void smp_apic_timer_interrupt(struct pt_regs *regs) - { -+ struct ve_struct *ve; - /* - * the NMI deadlock-detector uses this. - */ -@@ -957,9 +958,11 @@ void smp_apic_timer_interrupt(struct pt_ - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); -+ ve = set_exec_env(get_ve0()); - irq_enter(); - smp_local_timer_interrupt(regs); - irq_exit(); -+ (void)set_exec_env(ve); - } - - /* -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/entry.S linux-2.6.16-026test009/arch/x86_64/kernel/entry.S ---- linux-2.6.16.orig/arch/x86_64/kernel/entry.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/entry.S 2006-04-19 15:02:12.000000000 +0400 -@@ -180,6 +180,10 @@ rff_trace: - * - * XXX if we had a free scratch register we could save the RSP into the stack frame - * and report it properly in ps. Unfortunately we haven't. -+ * -+ * When user can change the frames always force IRET. That is because -+ * it deals with uncanonical addresses better. SYSRET has trouble -+ * with them due to bugs in both AMD and Intel CPUs. - */ - - ENTRY(system_call) -@@ -244,7 +248,7 @@ sysret_careful: - /* Handle a signal */ - sysret_signal: - sti -- testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx -+ testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx - jz 1f - - /* Really a signal */ -@@ -254,7 +258,10 @@ sysret_signal: - xorl %esi,%esi # oldset -> arg2 - call ptregscall_common - 1: movl $_TIF_NEED_RESCHED,%edi -- jmp sysret_check -+ /* Use IRET because user could have changed frame. This -+ works because ptregscall_common has called FIXUP_TOP_OF_STACK. 
*/ -+ cli -+ jmp int_with_check - - badsys: - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) -@@ -280,7 +287,8 @@ tracesys: - call syscall_trace_leave - RESTORE_TOP_OF_STACK %rbx - RESTORE_REST -- jmp ret_from_sys_call -+ /* Use IRET because user could have changed frame */ -+ jmp int_ret_from_sys_call - CFI_ENDPROC - - /* -@@ -350,7 +358,7 @@ int_very_careful: - jmp int_restore_rest - - int_signal: -- testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx -+ testl $(_TIF_NOTIFY_RESUME|_TIF_RESTORE_SIGMASK|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 -@@ -408,25 +416,9 @@ ENTRY(stub_execve) - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 - SAVE_REST -- movq %r11, %r15 -- CFI_REGISTER rip, r15 - FIXUP_TOP_OF_STACK %r11 - call sys_execve -- GET_THREAD_INFO(%rcx) -- bt $TIF_IA32,threadinfo_flags(%rcx) -- CFI_REMEMBER_STATE -- jc exec_32bit - RESTORE_TOP_OF_STACK %r11 -- movq %r15, %r11 -- CFI_REGISTER rip, r11 -- RESTORE_REST -- pushq %r11 -- CFI_ADJUST_CFA_OFFSET 8 -- CFI_REL_OFFSET rip, 0 -- ret -- --exec_32bit: -- CFI_RESTORE_STATE - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call -@@ -574,7 +566,7 @@ retint_careful: - jmp retint_check - - retint_signal: -- testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx -+ testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx - jz retint_swapgs - sti - SAVE_REST -@@ -845,7 +837,7 @@ ENTRY(kernel_thread) - xorl %r9d,%r9d - - # clone now -- call do_fork -+ call do_fork_kthread - movq %rax,RAX(%rsp) - xorl %edi,%edi - -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/head.S linux-2.6.16-026test009/arch/x86_64/kernel/head.S ---- linux-2.6.16.orig/arch/x86_64/kernel/head.S 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/head.S 2006-04-19 15:02:12.000000000 +0400 -@@ -40,7 +40,7 @@ startup_32: - */ - - /* Initialize the %ds segment register */ -- movl $__KERNEL_DS,%eax -+ 
movl $__BOOT_DS,%eax - movl %eax,%ds - - /* Load new GDT with the 64bit segments using 32bit descriptor */ -@@ -183,7 +183,14 @@ startup_64: - /* esi is pointer to real mode structure with interesting info. - pass it to C */ - movl %esi, %edi -- -+ -+ /* Switch to __KERNEL_CS. The segment is the same, but selector -+ * is different. */ -+ pushq $__KERNEL_CS -+ pushq $switch_cs -+ lretq -+switch_cs: -+ - /* Finally jump to run C code and to be on real kernel address - * Since we are running on identity-mapped space we have to jump - * to the full 64bit address , this is only possible as indirect -@@ -243,7 +250,7 @@ pGDT32: - .org 0xf10 - ljumpvector: - .long startup_64-__START_KERNEL_map -- .word __KERNEL_CS -+ .word __BOOT_CS - - ENTRY(stext) - ENTRY(_stext) -@@ -355,21 +362,30 @@ gdt: - .align PAGE_SIZE - - /* The TLS descriptors are currently at a different place compared to i386. -- Hopefully nobody expects them at a fixed place (Wine?) */ -+ Hopefully nobody expects them at a fixed place (Wine?) -+ Descriptors rearranged to place 32bit and TLS selectors in the same -+ places, because it is really necessary. sysret/exit mandates order -+ of kernel/user cs/ds, so we have to extend gdt. 
-+*/ - - ENTRY(cpu_gdt_table) -- .quad 0x0000000000000000 /* NULL descriptor */ -- .quad 0x0 /* unused */ -- .quad 0x00af9a000000ffff /* __KERNEL_CS */ -- .quad 0x00cf92000000ffff /* __KERNEL_DS */ -- .quad 0x00cffa000000ffff /* __USER32_CS */ -- .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ -- .quad 0x00affa000000ffff /* __USER_CS */ -- .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ -- .quad 0,0 /* TSS */ -- .quad 0,0 /* LDT */ -- .quad 0,0,0 /* three TLS descriptors */ -- .quad 0 /* unused */ -+ .quad 0x0000000000000000 /* 0 NULL descriptor */ -+ .quad 0x0 /* 1 unused */ -+ .quad 0x00af9a000000ffff /* 2 __BOOT_CS */ -+ .quad 0x00cf92000000ffff /* 3 __BOOT_DS */ -+ .quad 0,0 /* 4,5 TSS */ -+ .quad 0,0,0 /* 6-8 three TLS descriptors */ -+ .quad 0,0 /* 9,10 LDT */ -+ .quad 0x00cf9a000000ffff /* 11 __KERNEL32_CS */ -+ .quad 0x00af9a000000ffff /* 12 __KERNEL_CS */ -+ .quad 0x00cf92000000ffff /* 13 __KERNEL_DS */ -+ .quad 0x00cffa000000ffff /* 14 __USER32_CS */ -+ .quad 0x00cff2000000ffff /* 15 __USER_DS, __USER32_DS */ -+ .quad 0x00affa000000ffff /* 16 __USER_CS */ -+ .quad 0x0 /* 17 unused */ -+ .quad 0,0,0,0,0,0 -+ .quad 0,0,0,0,0,0,0,0 -+ - gdt_end: - /* asm/segment.h:GDT_ENTRIES must match this */ - /* This should be a multiple of the cache line size */ -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/irq.c linux-2.6.16-026test009/arch/x86_64/kernel/irq.c ---- linux-2.6.16.orig/arch/x86_64/kernel/irq.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/irq.c 2006-04-19 15:02:12.000000000 +0400 -@@ -98,12 +98,15 @@ asmlinkage unsigned int do_IRQ(struct pt - { - /* high bits used in ret_from_ code */ - unsigned irq = regs->orig_rax & 0xff; -+ struct ve_struct *ve; - - exit_idle(); -+ ve = set_exec_env(get_ve0()); - irq_enter(); - - __do_IRQ(irq, regs); - irq_exit(); -+ (void)set_exec_env(ve); - - return 1; - } -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/ldt.c linux-2.6.16-026test009/arch/x86_64/kernel/ldt.c ---- 
linux-2.6.16.orig/arch/x86_64/kernel/ldt.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/ldt.c 2006-04-19 15:02:12.000000000 +0400 -@@ -16,6 +16,7 @@ - #include <linux/smp_lock.h> - #include <linux/vmalloc.h> - #include <linux/slab.h> -+#include <linux/module.h> - - #include <asm/uaccess.h> - #include <asm/system.h> -@@ -23,6 +24,8 @@ - #include <asm/desc.h> - #include <asm/proto.h> - -+#include <ub/ub_mem.h> -+ - #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ - static void flush_ldt(void *null) - { -@@ -42,9 +45,9 @@ static int alloc_ldt(mm_context_t *pc, u - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) -- newldt = vmalloc(mincount*LDT_ENTRY_SIZE); -+ newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); - else -- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); -+ newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; -@@ -109,6 +112,7 @@ int init_new_context(struct task_struct - } - return retval; - } -+EXPORT_SYMBOL_GPL(init_new_context); - - /* - * -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/nmi.c linux-2.6.16-026test009/arch/x86_64/kernel/nmi.c ---- linux-2.6.16.orig/arch/x86_64/kernel/nmi.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/nmi.c 2006-04-19 15:02:11.000000000 +0400 -@@ -522,6 +522,7 @@ static __kprobes int dummy_nmi_callback( - } - - static nmi_callback_t nmi_callback = dummy_nmi_callback; -+static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; - - asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) - { -@@ -531,9 +532,21 @@ asmlinkage __kprobes void do_nmi(struct - add_pda(__nmi_count,1); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); -+ -+ nmi_ipi_callback(regs, cpu); - nmi_exit(); - } - -+void set_nmi_ipi_callback(nmi_callback_t callback) -+{ -+ nmi_ipi_callback = callback; -+} -+ -+void 
unset_nmi_ipi_callback(void) -+{ -+ nmi_ipi_callback = dummy_nmi_callback; -+} -+ - void set_nmi_callback(nmi_callback_t callback) - { - rcu_assign_pointer(nmi_callback, callback); -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/process.c linux-2.6.16-026test009/arch/x86_64/kernel/process.c ---- linux-2.6.16.orig/arch/x86_64/kernel/process.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/process.c 2006-04-19 15:02:12.000000000 +0400 -@@ -54,6 +54,11 @@ - #include <asm/idle.h> - - asmlinkage extern void ret_from_fork(void); -+asmlinkage extern void int_ret_from_sys_call(void); -+asmlinkage extern void execve(void); -+EXPORT_SYMBOL_GPL(ret_from_fork); -+EXPORT_SYMBOL_GPL(int_ret_from_sys_call); -+EXPORT_SYMBOL_GPL(execve); - - unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - -@@ -303,7 +308,8 @@ void __show_regs(struct pt_regs * regs) - (int)strcspn(system_utsname.version, " "), - system_utsname.version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); -- printk_address(regs->rip); -+ if (decode_call_traces) -+ printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, - regs->eflags); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", -@@ -345,6 +351,21 @@ void show_regs(struct pt_regs *regs) - show_trace(&regs->rsp); - } - -+void smp_show_regs(struct pt_regs *regs, void *data) -+{ -+ static DEFINE_SPINLOCK(show_regs_lock); -+ -+ if (regs == NULL) -+ return; -+ -+ bust_spinlocks(1); -+ spin_lock(&show_regs_lock); -+ printk("----------- IPI show regs -----------\n"); -+ show_regs(regs); -+ spin_unlock(&show_regs_lock); -+ bust_spinlocks(0); -+} -+ - /* - * Free current thread data structures etc.. 
- */ -@@ -841,3 +862,20 @@ unsigned long arch_align_stack(unsigned - sp -= get_random_int() % 8192; - return sp & ~0xf; - } -+ -+long do_fork_kthread(unsigned long clone_flags, -+ unsigned long stack_start, -+ struct pt_regs *regs, -+ unsigned long stack_size, -+ int __user *parent_tidptr, -+ int __user *child_tidptr) -+{ -+ if (ve_is_super(get_exec_env())) -+ return do_fork(clone_flags, stack_start, regs, stack_size, -+ parent_tidptr, child_tidptr); -+ -+ /* Don't allow kernel_thread() inside VE */ -+ printk("kernel_thread call inside VE\n"); -+ dump_stack(); -+ return -EPERM; -+} -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/ptrace.c linux-2.6.16-026test009/arch/x86_64/kernel/ptrace.c ---- linux-2.6.16.orig/arch/x86_64/kernel/ptrace.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/ptrace.c 2006-04-19 15:02:12.000000000 +0400 -@@ -300,6 +300,15 @@ static unsigned long getreg(struct task_ - return child->thread.fs; - case offsetof(struct user_regs_struct, gs_base): - return child->thread.gs; -+ case offsetof(struct user_regs_struct, cs): -+ if (test_tsk_thread_flag(child, TIF_SYSCALL_TRACE)) { -+ val = get_stack_long(child, regno - sizeof(struct pt_regs)); -+ if (val == __USER_CS) -+ return 0x33; -+ if (val == __USER32_CS) -+ return 0x23; -+ } -+ /* fall through */ - default: - regno = regno - sizeof(struct pt_regs); - val = get_stack_long(child, regno); -@@ -581,8 +590,10 @@ static void syscall_trace(struct pt_regs - current_thread_info()->flags, current->ptrace); - #endif - -+ set_pn_state(current, (regs->rax != -ENOSYS) ? PN_STOP_LEAVE : PN_STOP_ENTRY); - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); -+ clear_pn_state(current); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. 
strace only continues with a signal if the -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/setup64.c linux-2.6.16-026test009/arch/x86_64/kernel/setup64.c ---- linux-2.6.16.orig/arch/x86_64/kernel/setup64.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/setup64.c 2006-04-19 15:02:12.000000000 +0400 -@@ -290,3 +290,5 @@ void __cpuinit cpu_init (void) - - fpu_init(); - } -+ -+EXPORT_SYMBOL_GPL(cpu_gdt_descr); -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/signal.c linux-2.6.16-026test009/arch/x86_64/kernel/signal.c ---- linux-2.6.16.orig/arch/x86_64/kernel/signal.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/signal.c 2006-04-19 15:02:12.000000000 +0400 -@@ -40,37 +40,6 @@ int ia32_setup_frame(int sig, struct k_s - sigset_t *set, struct pt_regs * regs); - - asmlinkage long --sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) --{ -- sigset_t saveset, newset; -- -- /* XXX: Don't preclude handling different sized sigset_t's. 
*/ -- if (sigsetsize != sizeof(sigset_t)) -- return -EINVAL; -- -- if (copy_from_user(&newset, unewset, sizeof(newset))) -- return -EFAULT; -- sigdelsetmask(&newset, ~_BLOCKABLE); -- -- spin_lock_irq(¤t->sighand->siglock); -- saveset = current->blocked; -- current->blocked = newset; -- recalc_sigpending(); -- spin_unlock_irq(¤t->sighand->siglock); --#ifdef DEBUG_SIG -- printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", -- saveset, newset, regs, regs->rip); --#endif -- regs->rax = -EINTR; -- while (1) { -- current->state = TASK_INTERRUPTIBLE; -- schedule(); -- if (do_signal(regs, &saveset)) -- return -EINTR; -- } --} -- --asmlinkage long - sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) - { -@@ -344,11 +313,11 @@ static int setup_rt_frame(int sig, struc - current->comm, current->pid, frame, regs->rip, frame->pretcode); - #endif - -- return 1; -+ return 0; - - give_sigsegv: - force_sigsegv(sig, current); -- return 0; -+ return -EFAULT; - } - - /* -@@ -411,7 +380,7 @@ handle_signal(unsigned long sig, siginfo - #endif - ret = setup_rt_frame(sig, ka, info, oldset, regs); - -- if (ret) { -+ if (ret == 0) { - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) -@@ -428,9 +397,10 @@ handle_signal(unsigned long sig, siginfo - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - */ --int do_signal(struct pt_regs *regs, sigset_t *oldset) -+static void do_signal(struct pt_regs *regs) - { - struct k_sigaction ka; -+ sigset_t *oldset; - siginfo_t info; - int signr; - -@@ -441,12 +411,14 @@ int do_signal(struct pt_regs *regs, sigs - * if so. 
- */ - if (!user_mode(regs)) -- return 1; -+ return; - -- if (try_to_freeze()) -+ if (try_to_freeze() && !signal_pending(current)) - goto no_signal; - -- if (!oldset) -+ if (test_thread_flag(TIF_RESTORE_SIGMASK)) -+ oldset = ¤t->saved_sigmask; -+ else - oldset = ¤t->blocked; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); -@@ -460,7 +432,15 @@ int do_signal(struct pt_regs *regs, sigs - set_debugreg(current->thread.debugreg7, 7); - - /* Whee! Actually deliver the signal. */ -- return handle_signal(signr, &info, &ka, oldset, regs); -+ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { -+ /* a signal was successfully delivered; the saved -+ * sigmask will have been stored in the signal frame, -+ * and will be restored by sigreturn, so we can simply -+ * clear the TIF_RESTORE_SIGMASK flag */ -+ if (test_thread_flag(TIF_RESTORE_SIGMASK)) -+ clear_thread_flag(TIF_RESTORE_SIGMASK); -+ } -+ return; - } - - no_signal: -@@ -481,10 +461,16 @@ int do_signal(struct pt_regs *regs, sigs - regs->rip -= 2; - } - } -- return 0; -+ -+ /* if there's no signal to deliver, we just put the saved sigmask -+ * back */ -+ if (test_thread_flag(TIF_RESTORE_SIGMASK)) { -+ clear_thread_flag(TIF_RESTORE_SIGMASK); -+ sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); -+ } - } - --void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) -+void do_notify_resume(struct pt_regs *regs, sigset_t *unused, __u32 thread_info_flags) - { - #ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", -@@ -498,8 +484,8 @@ void do_notify_resume(struct pt_regs *re - } - - /* deal with pending signal delivery */ -- if (thread_info_flags & _TIF_SIGPENDING) -- do_signal(regs,oldset); -+ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) -+ do_signal(regs); - } - - void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/smp.c 
linux-2.6.16-026test009/arch/x86_64/kernel/smp.c ---- linux-2.6.16.orig/arch/x86_64/kernel/smp.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/smp.c 2006-04-19 15:02:11.000000000 +0400 -@@ -28,6 +28,7 @@ - #include <asm/proto.h> - #include <asm/apicdef.h> - #include <asm/idle.h> -+#include <asm/nmi.h> - - /* - * Smarter SMP flushing macros. -@@ -444,6 +445,84 @@ int smp_call_function (void (*func) (voi - return 0; - } - -+static spinlock_t nmi_call_lock = SPIN_LOCK_UNLOCKED; -+static struct nmi_call_data_struct { -+ smp_nmi_function func; -+ void *info; -+ atomic_t started; -+ atomic_t finished; -+ cpumask_t cpus_called; -+ int wait; -+} *nmi_call_data; -+ -+static int smp_nmi_callback(struct pt_regs * regs, int cpu) -+{ -+ smp_nmi_function func; -+ void *info; -+ int wait; -+ -+ func = nmi_call_data->func; -+ info = nmi_call_data->info; -+ wait = nmi_call_data->wait; -+ ack_APIC_irq(); -+ /* prevent from calling func() multiple times */ -+ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) -+ return 0; -+ /* -+ * notify initiating CPU that I've grabbed the data and am -+ * about to execute the function -+ */ -+ mb(); -+ atomic_inc(&nmi_call_data->started); -+ /* at this point the nmi_call_data structure is out of scope */ -+ irq_enter(); -+ func(regs, info); -+ irq_exit(); -+ if (wait) -+ atomic_inc(&nmi_call_data->finished); -+ -+ return 0; -+} -+ -+int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) -+{ -+ struct nmi_call_data_struct data; -+ int cpus; -+ -+ cpus = num_online_cpus() - 1; -+ if (!cpus) -+ return 0; -+ -+ data.func = func; -+ data.info = info; -+ data.wait = wait; -+ atomic_set(&data.started, 0); -+ atomic_set(&data.finished, 0); -+ cpus_clear(data.cpus_called); -+ /* prevent this cpu from calling func if NMI happens */ -+ cpu_set(smp_processor_id(), data.cpus_called); -+ -+ if (!spin_trylock(&nmi_call_lock)) -+ return -1; -+ -+ nmi_call_data = &data; -+ 
set_nmi_ipi_callback(smp_nmi_callback); -+ mb(); -+ -+ /* Send a message to all other CPUs and wait for them to respond */ -+ send_IPI_allbutself(APIC_DM_NMI); -+ while (atomic_read(&data.started) != cpus) -+ barrier(); -+ -+ unset_nmi_ipi_callback(); -+ if (wait) -+ while (atomic_read(&data.finished) != cpus) -+ barrier(); -+ spin_unlock(&nmi_call_lock); -+ -+ return 0; -+} -+ - void smp_stop_cpu(void) - { - unsigned long flags; -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/sys_x86_64.c linux-2.6.16-026test009/arch/x86_64/kernel/sys_x86_64.c ---- linux-2.6.16.orig/arch/x86_64/kernel/sys_x86_64.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/sys_x86_64.c 2006-04-19 15:02:12.000000000 +0400 -@@ -148,7 +148,7 @@ asmlinkage long sys_uname(struct new_uts - { - int err; - down_read(&uts_sem); -- err = copy_to_user(name, &system_utsname, sizeof (*name)); -+ err = copy_to_user(name, &ve_utsname, sizeof (*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/time.c linux-2.6.16-026test009/arch/x86_64/kernel/time.c ---- linux-2.6.16.orig/arch/x86_64/kernel/time.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/time.c 2006-04-19 15:02:12.000000000 +0400 -@@ -66,6 +66,8 @@ unsigned long vxtime_hz = PIT_TICK_RATE; - int report_lost_ticks; /* command line option */ - unsigned long long monotonic_base; - -+EXPORT_SYMBOL(cpu_khz); -+ - struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - - volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -diff -upr linux-2.6.16.orig/arch/x86_64/kernel/traps.c linux-2.6.16-026test009/arch/x86_64/kernel/traps.c ---- linux-2.6.16.orig/arch/x86_64/kernel/traps.c 2006-04-19 15:02:00.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/kernel/traps.c 2006-04-19 15:02:12.000000000 +0400 -@@ -116,6 +116,9 @@ int 
printk_address(unsigned long address - char *delim = ":"; - char namebuf[128]; - -+ if (!decode_call_traces) -+ return printk("[<%016lx>]", address); -+ - symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); - if (!symname) - return printk("[<%016lx>]", address); -@@ -208,7 +211,7 @@ void show_trace(unsigned long *stack) - do while (cond) { \ - unsigned long addr = *stack++; \ - if (kernel_text_address(addr)) { \ -- if (i > 50) { \ -+ if (i > 50 && decode_call_traces) { \ - printk("\n "); \ - i = 0; \ - } \ -@@ -319,10 +322,12 @@ void show_registers(struct pt_regs *regs - - rsp = regs->rsp; - -- printk("CPU %d ", cpu); -+ printk("CPU: %d ", cpu); - __show_regs(regs); -- printk("Process %s (pid: %d, threadinfo %p, task %p)\n", -- cur->comm, cur->pid, task_thread_info(cur), cur); -+ printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n", -+ cur->comm, cur->pid, -+ VEID(VE_TASK_INFO(current)->owner_env), -+ task_thread_info(cur), cur); - - /* - * When in-kernel, we also print out the stack and code at the -@@ -458,6 +463,7 @@ void __kprobes die_nmi(char *str, struct - show_registers(regs); - if (panic_on_timeout || panic_on_oops) - panic("nmi watchdog"); -+ smp_nmi_call_function(smp_show_regs, NULL, 1); - printk("console shuts up ...\n"); - oops_end(flags); - do_exit(SIGSEGV); -diff -upr linux-2.6.16.orig/arch/x86_64/mm/fault.c linux-2.6.16-026test009/arch/x86_64/mm/fault.c ---- linux-2.6.16.orig/arch/x86_64/mm/fault.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/mm/fault.c 2006-04-19 15:02:12.000000000 +0400 -@@ -41,27 +41,6 @@ - #define PF_RSVD (1<<3) - #define PF_INSTR (1<<4) - --void bust_spinlocks(int yes) --{ -- int loglevel_save = console_loglevel; -- if (yes) { -- oops_in_progress = 1; -- } else { --#ifdef CONFIG_VT -- unblank_screen(); --#endif -- oops_in_progress = 0; -- /* -- * OK, the message is on the console. 
Now we call printk() -- * without oops_in_progress set so that printk will give klogd -- * a poke. Hold onto your hats... -- */ -- console_loglevel = 15; /* NMI oopser may have shut the console up */ -- printk(" "); -- console_loglevel = loglevel_save; -- } --} -- - /* Sometimes the CPU reports invalid exceptions on prefetch. - Check that here and ignore. - Opcode checker based on code by Richard Brunner */ -@@ -293,7 +272,7 @@ static int vmalloc_fault(unsigned long a - } - - int page_fault_trace = 0; --int exception_trace = 1; -+int exception_trace = 0; - - /* - * This routine handles page faults. It determines the address, -@@ -322,7 +301,7 @@ asmlinkage void __kprobes do_page_fault( - local_irq_enable(); - - if (unlikely(page_fault_trace)) -- printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", -+ ve_printk(VE_LOG, "pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", - regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); - - tsk = current; -@@ -372,7 +351,6 @@ asmlinkage void __kprobes do_page_fault( - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; - -- again: - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunatly, in the case of an -@@ -476,7 +454,7 @@ bad_area_nosemaphore: - return; - - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { -- printk( -+ ve_printk(VE_LOG, - "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, -@@ -544,13 +522,14 @@ no_context: - */ - out_of_memory: - up_read(&mm->mmap_sem); -- if (current->pid == 1) { -- yield(); -- goto again; -+ if (error_code & 4) { -+ /* -+ * 0-order allocation always success if something really -+ * fatal not happen: beancounter overdraft or OOM. 
-+ */ -+ force_sig(SIGKILL, tsk); -+ return; - } -- printk("VM: killing process %s\n", tsk->comm); -- if (error_code & 4) -- do_exit(SIGKILL); - goto no_context; - - do_sigbus: -diff -upr linux-2.6.16.orig/arch/x86_64/mm/init.c linux-2.6.16-026test009/arch/x86_64/mm/init.c ---- linux-2.6.16.orig/arch/x86_64/mm/init.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/arch/x86_64/mm/init.c 2006-04-19 15:02:12.000000000 +0400 -@@ -89,6 +89,7 @@ void show_mem(void) - printk(KERN_INFO "%lu pages shared\n",shared); - printk(KERN_INFO "%lu pages swap cached\n",cached); - } -+EXPORT_SYMBOL(show_mem); - - /* References to section boundaries */ - -diff -upr linux-2.6.16.orig/block/elevator.c linux-2.6.16-026test009/block/elevator.c ---- linux-2.6.16.orig/block/elevator.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/block/elevator.c 2006-04-19 15:02:12.000000000 +0400 -@@ -676,7 +676,7 @@ void elv_unregister(struct elevator_type - * Iterate every thread in the process to remove the io contexts. 
- */ - read_lock(&tasklist_lock); -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - struct io_context *ioc = p->io_context; - if (ioc && ioc->cic) { - ioc->cic->exit(ioc->cic); -@@ -688,7 +688,7 @@ void elv_unregister(struct elevator_type - ioc->aic->dtor(ioc->aic); - ioc->aic = NULL; - } -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - read_unlock(&tasklist_lock); - - spin_lock_irq(&elv_list_lock); -diff -upr linux-2.6.16.orig/block/genhd.c linux-2.6.16-026test009/block/genhd.c ---- linux-2.6.16.orig/block/genhd.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/block/genhd.c 2006-04-19 15:02:12.000000000 +0400 -@@ -18,7 +18,8 @@ - - #define MAX_PROBE_HASH 255 /* random */ - --static struct subsystem block_subsys; -+struct subsystem block_subsys; -+EXPORT_SYMBOL(block_subsys); - - static DECLARE_MUTEX(block_subsys_sem); - -@@ -592,7 +593,7 @@ static struct kset_uevent_ops block_ueve - }; - - /* declare block_subsys. */ --static decl_subsys(block, &ktype_block, &block_uevent_ops); -+decl_subsys(block, &ktype_block, &block_uevent_ops); - - - /* -diff -upr linux-2.6.16.orig/drivers/base/class.c linux-2.6.16-026test009/drivers/base/class.c ---- linux-2.6.16.orig/drivers/base/class.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/base/class.c 2006-04-19 15:02:12.000000000 +0400 -@@ -72,8 +72,13 @@ static struct kobj_type ktype_class = { - }; - - /* Hotplug events for classes go to the class_obj subsys */ --static decl_subsys(class, &ktype_class, NULL); -+decl_subsys(class, &ktype_class, NULL); - -+#ifndef CONFIG_VE -+#define visible_class_subsys class_subsys -+#else -+#define visible_class_subsys (*get_exec_env()->class_subsys) -+#endif - - int class_create_file(struct class * cls, const struct class_attribute * attr) - { -@@ -148,7 +153,7 @@ int class_register(struct class * cls) - if (error) - return error; - -- subsys_set_kset(cls, class_subsys); -+ subsys_set_kset(cls, visible_class_subsys); - - 
error = subsystem_register(&cls->subsys); - if (!error) { -@@ -420,8 +425,13 @@ static struct kset_uevent_ops class_ueve - .uevent = class_uevent, - }; - --static decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops); -+decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops); - -+#ifndef CONFIG_VE -+#define visible_class_obj_subsys class_obj_subsys -+#else -+#define visible_class_obj_subsys (*get_exec_env()->class_obj_subsys) -+#endif - - static int class_device_add_attrs(struct class_device * cd) - { -@@ -470,7 +480,7 @@ static ssize_t store_uevent(struct class - - void class_device_initialize(struct class_device *class_dev) - { -- kobj_set_kset_s(class_dev, class_obj_subsys); -+ kobj_set_kset_s(class_dev, visible_class_obj_subsys); - kobject_init(&class_dev->kobj); - INIT_LIST_HEAD(&class_dev->node); - } -@@ -805,12 +815,19 @@ void class_interface_unregister(struct c - class_put(parent); - } - -- -+void prepare_sysfs_classes(void) -+{ -+#ifdef CONFIG_VE -+ get_ve0()->class_subsys = &class_subsys; -+ get_ve0()->class_obj_subsys = &class_obj_subsys; -+#endif -+} - - int __init classes_init(void) - { - int retval; - -+ prepare_sysfs_classes(); - retval = subsystem_register(&class_subsys); - if (retval) - return retval; -@@ -848,3 +865,6 @@ EXPORT_SYMBOL_GPL(class_device_remove_bi - - EXPORT_SYMBOL_GPL(class_interface_register); - EXPORT_SYMBOL_GPL(class_interface_unregister); -+ -+EXPORT_SYMBOL(class_subsys); -+EXPORT_SYMBOL(class_obj_subsys); -diff -upr linux-2.6.16.orig/drivers/base/cpu.c linux-2.6.16-026test009/drivers/base/cpu.c ---- linux-2.6.16.orig/drivers/base/cpu.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/base/cpu.c 2006-04-19 15:02:11.000000000 +0400 -@@ -141,7 +141,7 @@ int __devinit register_cpu(struct cpu *c - return error; - } - --struct sys_device *get_cpu_sysdev(int cpu) -+struct sys_device *get_cpu_sysdev(unsigned cpu) - { - if (cpu < NR_CPUS) - return cpu_sys_devices[cpu]; -diff -upr 
linux-2.6.16.orig/drivers/base/firmware_class.c linux-2.6.16-026test009/drivers/base/firmware_class.c ---- linux-2.6.16.orig/drivers/base/firmware_class.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/base/firmware_class.c 2006-04-19 15:02:11.000000000 +0400 -@@ -211,18 +211,20 @@ static int - fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size) - { - u8 *new_data; -+ int new_size = fw_priv->alloc_size; - - if (min_size <= fw_priv->alloc_size) - return 0; - -- new_data = vmalloc(fw_priv->alloc_size + PAGE_SIZE); -+ new_size = ALIGN(min_size, PAGE_SIZE); -+ new_data = vmalloc(new_size); - if (!new_data) { - printk(KERN_ERR "%s: unable to alloc buffer\n", __FUNCTION__); - /* Make sure that we don't keep incomplete data */ - fw_load_abort(fw_priv); - return -ENOMEM; - } -- fw_priv->alloc_size += PAGE_SIZE; -+ fw_priv->alloc_size = new_size; - if (fw_priv->fw->data) { - memcpy(new_data, fw_priv->fw->data, fw_priv->fw->size); - vfree(fw_priv->fw->data); -diff -upr linux-2.6.16.orig/drivers/base/node.c linux-2.6.16-026test009/drivers/base/node.c ---- linux-2.6.16.orig/drivers/base/node.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/base/node.c 2006-04-19 15:02:11.000000000 +0400 -@@ -106,7 +106,7 @@ static ssize_t node_read_numastat(struct - other_node = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *z = &pg->node_zones[i]; -- for (cpu = 0; cpu < NR_CPUS; cpu++) { -+ for_each_online_cpu(cpu) { - struct per_cpu_pageset *ps = zone_pcp(z,cpu); - numa_hit += ps->numa_hit; - numa_miss += ps->numa_miss; -diff -upr linux-2.6.16.orig/drivers/block/cciss.c linux-2.6.16-026test009/drivers/block/cciss.c ---- linux-2.6.16.orig/drivers/block/cciss.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/block/cciss.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1181,6 +1181,53 @@ static int revalidate_allvol(ctlr_info_t - return 0; - } - -+static inline void complete_buffers(struct bio *bio, int 
status) -+{ -+ while (bio) { -+ struct bio *xbh = bio->bi_next; -+ int nr_sectors = bio_sectors(bio); -+ -+ bio->bi_next = NULL; -+ blk_finished_io(len); -+ bio_endio(bio, nr_sectors << 9, status ? 0 : -EIO); -+ bio = xbh; -+ } -+ -+} -+ -+static void cciss_softirq_done(struct request *rq) -+{ -+ CommandList_struct *cmd = rq->completion_data; -+ ctlr_info_t *h = hba[cmd->ctlr]; -+ unsigned long flags; -+ u64bit temp64; -+ int i, ddir; -+ -+ if (cmd->Request.Type.Direction == XFER_READ) -+ ddir = PCI_DMA_FROMDEVICE; -+ else -+ ddir = PCI_DMA_TODEVICE; -+ -+ /* command did not need to be retried */ -+ /* unmap the DMA mapping for all the scatter gather elements */ -+ for(i=0; i<cmd->Header.SGList; i++) { -+ temp64.val32.lower = cmd->SG[i].Addr.lower; -+ temp64.val32.upper = cmd->SG[i].Addr.upper; -+ pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); -+ } -+ -+ complete_buffers(rq->bio, rq->errors); -+ -+#ifdef CCISS_DEBUG -+ printk("Done with %p\n", rq); -+#endif /* CCISS_DEBUG */ -+ -+ spin_lock_irqsave(&h->lock, flags); -+ end_that_request_last(rq, rq->errors); -+ cmd_free(h, cmd,1); -+ spin_unlock_irqrestore(&h->lock, flags); -+} -+ - /* This function will check the usage_count of the drive to be updated/added. - * If the usage_count is zero then the drive information will be updated and - * the disk will be re-registered with the kernel. If not then it will be -@@ -1249,6 +1296,8 @@ static void cciss_update_drive_info(int - - blk_queue_max_sectors(disk->queue, 512); - -+ blk_queue_softirq_done(disk->queue, cciss_softirq_done); -+ - disk->queue->queuedata = hba[ctlr]; - - blk_queue_hardsect_size(disk->queue, -@@ -2148,20 +2197,6 @@ static void start_io( ctlr_info_t *h) - addQ (&(h->cmpQ), c); - } - } -- --static inline void complete_buffers(struct bio *bio, int status) --{ -- while (bio) { -- struct bio *xbh = bio->bi_next; -- int nr_sectors = bio_sectors(bio); -- -- bio->bi_next = NULL; -- blk_finished_io(len); -- bio_endio(bio, nr_sectors << 9, status ? 
0 : -EIO); -- bio = xbh; -- } -- --} - /* Assumes that CCISS_LOCK(h->ctlr) is held. */ - /* Zeros out the error record and then resends the command back */ - /* to the controller */ -@@ -2179,39 +2214,6 @@ static inline void resend_cciss_cmd( ctl - start_io(h); - } - --static void cciss_softirq_done(struct request *rq) --{ -- CommandList_struct *cmd = rq->completion_data; -- ctlr_info_t *h = hba[cmd->ctlr]; -- unsigned long flags; -- u64bit temp64; -- int i, ddir; -- -- if (cmd->Request.Type.Direction == XFER_READ) -- ddir = PCI_DMA_FROMDEVICE; -- else -- ddir = PCI_DMA_TODEVICE; -- -- /* command did not need to be retried */ -- /* unmap the DMA mapping for all the scatter gather elements */ -- for(i=0; i<cmd->Header.SGList; i++) { -- temp64.val32.lower = cmd->SG[i].Addr.lower; -- temp64.val32.upper = cmd->SG[i].Addr.upper; -- pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); -- } -- -- complete_buffers(rq->bio, rq->errors); -- --#ifdef CCISS_DEBUG -- printk("Done with %p\n", rq); --#endif /* CCISS_DEBUG */ -- -- spin_lock_irqsave(&h->lock, flags); -- end_that_request_last(rq, rq->errors); -- cmd_free(h, cmd,1); -- spin_unlock_irqrestore(&h->lock, flags); --} -- - /* checks the status of the job and calls complete buffers to mark all - * buffers for the completed job. Note that this function does not need - * to hold the hba/queue lock. 
-@@ -3269,8 +3271,8 @@ clean2: - unregister_blkdev(hba[i]->major, hba[i]->devname); - clean1: - release_io_mem(hba[i]); -- free_hba(i); - hba[i]->busy_initializing = 0; -+ free_hba(i); - return(-1); - } - -diff -upr linux-2.6.16.orig/drivers/char/Kconfig linux-2.6.16-026test009/drivers/char/Kconfig ---- linux-2.6.16.orig/drivers/char/Kconfig 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/char/Kconfig 2006-04-19 15:02:11.000000000 +0400 -@@ -187,6 +187,7 @@ config MOXA_SMARTIO - config ISI - tristate "Multi-Tech multiport card support (EXPERIMENTAL)" - depends on SERIAL_NONSTANDARD -+ select FW_LOADER - help - This is a driver for the Multi-Tech cards which provide several - serial ports. The driver is experimental and can currently only be -diff -upr linux-2.6.16.orig/drivers/char/pty.c linux-2.6.16-026test009/drivers/char/pty.c ---- linux-2.6.16.orig/drivers/char/pty.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/char/pty.c 2006-04-19 15:02:12.000000000 +0400 -@@ -32,16 +32,30 @@ - #include <linux/bitops.h> - #include <linux/devpts_fs.h> - -+#include <ub/ub_misc.h> -+ - /* These are global because they are accessed in tty_io.c */ - #ifdef CONFIG_UNIX98_PTYS - struct tty_driver *ptm_driver; --static struct tty_driver *pts_driver; -+struct tty_driver *pts_driver; -+EXPORT_SYMBOL(ptm_driver); -+EXPORT_SYMBOL(pts_driver); -+ -+void prepare_pty(void) -+{ -+#ifdef CONFIG_VE -+ get_ve0()->ptm_driver = ptm_driver; -+ /* don't clean ptm_driver and co. 
here, they are used in vecalls.c */ -+#endif -+} - #endif - - static void pty_close(struct tty_struct * tty, struct file * filp) - { - if (!tty) - return; -+ -+ ub_pty_uncharge(tty); - if (tty->driver->subtype == PTY_TYPE_MASTER) { - if (tty->count > 1) - printk("master pty_close: count = %d!!\n", tty->count); -@@ -61,8 +75,12 @@ static void pty_close(struct tty_struct - if (tty->driver->subtype == PTY_TYPE_MASTER) { - set_bit(TTY_OTHER_CLOSED, &tty->flags); - #ifdef CONFIG_UNIX98_PTYS -- if (tty->driver == ptm_driver) -+ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) { -+ struct ve_struct *old_env; -+ old_env = set_exec_env(VE_OWNER_TTY(tty)); - devpts_pty_kill(tty->index); -+ (void)set_exec_env(old_env); -+ } - #endif - tty_vhangup(tty->link); - } -@@ -212,6 +230,10 @@ static int pty_open(struct tty_struct *t - if (tty->link->count != 1) - goto out; - -+ retval = -ENODEV; -+ if (ub_pty_charge(tty)) -+ goto out; -+ - clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); - set_bit(TTY_THROTTLED, &tty->flags); - set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); -@@ -239,7 +261,9 @@ static struct tty_operations pty_ops = { - - /* Traditional BSD devices */ - #ifdef CONFIG_LEGACY_PTYS --static struct tty_driver *pty_driver, *pty_slave_driver; -+struct tty_driver *pty_driver, *pty_slave_driver; -+EXPORT_SYMBOL(pty_driver); -+EXPORT_SYMBOL(pty_slave_driver); - - static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file, - unsigned int cmd, unsigned long arg) -@@ -397,6 +421,7 @@ static void __init unix98_pty_init(void) - panic("Couldn't register Unix98 pts driver"); - - pty_table[1].data = &ptm_driver->refcount; -+ prepare_pty(); - } - #else - static inline void unix98_pty_init(void) { } -diff -upr linux-2.6.16.orig/drivers/char/snsc_event.c linux-2.6.16-026test009/drivers/char/snsc_event.c ---- linux-2.6.16.orig/drivers/char/snsc_event.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/char/snsc_event.c 2006-04-19 15:02:12.000000000 +0400 -@@ 
-206,7 +206,7 @@ scdrv_dispatch_event(char *event, int le - - /* first find init's task */ - read_lock(&tasklist_lock); -- for_each_process(p) { -+ for_each_process_all(p) { - if (p->pid == 1) - break; - } -diff -upr linux-2.6.16.orig/drivers/char/sysrq.c linux-2.6.16-026test009/drivers/char/sysrq.c ---- linux-2.6.16.orig/drivers/char/sysrq.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/char/sysrq.c 2006-04-19 15:02:12.000000000 +0400 -@@ -174,8 +174,13 @@ static struct sysrq_key_op sysrq_showloc - static void sysrq_handle_showregs(int key, struct pt_regs *pt_regs, - struct tty_struct *tty) - { -+ bust_spinlocks(1); - if (pt_regs) - show_regs(pt_regs); -+ bust_spinlocks(0); -+#if defined(__i386__) || defined(__x86_64__) -+ smp_nmi_call_function(smp_show_regs, NULL, 0); -+#endif - } - static struct sysrq_key_op sysrq_showregs_op = { - .handler = sysrq_handle_showregs, -@@ -221,7 +226,7 @@ static void send_sig_all(int sig) - { - struct task_struct *p; - -- for_each_process(p) { -+ for_each_process_all(p) { - if (p->mm && p->pid != 1) - /* Not swapper, init nor kernel thread */ - force_sig(sig, p); -diff -upr linux-2.6.16.orig/drivers/char/tlclk.c linux-2.6.16-026test009/drivers/char/tlclk.c ---- linux-2.6.16.orig/drivers/char/tlclk.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/char/tlclk.c 2006-04-19 15:02:11.000000000 +0400 -@@ -327,7 +327,7 @@ static ssize_t store_received_ref_clk3a( - return strnlen(buf, count); - } - --static DEVICE_ATTR(received_ref_clk3a, S_IWUGO, NULL, -+static DEVICE_ATTR(received_ref_clk3a, (S_IWUSR|S_IWGRP), NULL, - store_received_ref_clk3a); - - -@@ -349,7 +349,7 @@ static ssize_t store_received_ref_clk3b( - return strnlen(buf, count); - } - --static DEVICE_ATTR(received_ref_clk3b, S_IWUGO, NULL, -+static DEVICE_ATTR(received_ref_clk3b, (S_IWUSR|S_IWGRP), NULL, - store_received_ref_clk3b); - - -@@ -371,7 +371,7 @@ static ssize_t store_enable_clk3b_output - return strnlen(buf, count); 
- } - --static DEVICE_ATTR(enable_clk3b_output, S_IWUGO, NULL, -+static DEVICE_ATTR(enable_clk3b_output, (S_IWUSR|S_IWGRP), NULL, - store_enable_clk3b_output); - - static ssize_t store_enable_clk3a_output(struct device *d, -@@ -392,7 +392,7 @@ static ssize_t store_enable_clk3a_output - return strnlen(buf, count); - } - --static DEVICE_ATTR(enable_clk3a_output, S_IWUGO, NULL, -+static DEVICE_ATTR(enable_clk3a_output, (S_IWUSR|S_IWGRP), NULL, - store_enable_clk3a_output); - - static ssize_t store_enable_clkb1_output(struct device *d, -@@ -413,7 +413,7 @@ static ssize_t store_enable_clkb1_output - return strnlen(buf, count); - } - --static DEVICE_ATTR(enable_clkb1_output, S_IWUGO, NULL, -+static DEVICE_ATTR(enable_clkb1_output, (S_IWUSR|S_IWGRP), NULL, - store_enable_clkb1_output); - - -@@ -435,7 +435,7 @@ static ssize_t store_enable_clka1_output - return strnlen(buf, count); - } - --static DEVICE_ATTR(enable_clka1_output, S_IWUGO, NULL, -+static DEVICE_ATTR(enable_clka1_output, (S_IWUSR|S_IWGRP), NULL, - store_enable_clka1_output); - - static ssize_t store_enable_clkb0_output(struct device *d, -@@ -456,7 +456,7 @@ static ssize_t store_enable_clkb0_output - return strnlen(buf, count); - } - --static DEVICE_ATTR(enable_clkb0_output, S_IWUGO, NULL, -+static DEVICE_ATTR(enable_clkb0_output, (S_IWUSR|S_IWGRP), NULL, - store_enable_clkb0_output); - - static ssize_t store_enable_clka0_output(struct device *d, -@@ -477,7 +477,7 @@ static ssize_t store_enable_clka0_output - return strnlen(buf, count); - } - --static DEVICE_ATTR(enable_clka0_output, S_IWUGO, NULL, -+static DEVICE_ATTR(enable_clka0_output, (S_IWUSR|S_IWGRP), NULL, - store_enable_clka0_output); - - static ssize_t store_select_amcb2_transmit_clock(struct device *d, -@@ -519,7 +519,7 @@ static ssize_t store_select_amcb2_transm - return strnlen(buf, count); - } - --static DEVICE_ATTR(select_amcb2_transmit_clock, S_IWUGO, NULL, -+static DEVICE_ATTR(select_amcb2_transmit_clock, (S_IWUSR|S_IWGRP), NULL, - 
store_select_amcb2_transmit_clock); - - static ssize_t store_select_amcb1_transmit_clock(struct device *d, -@@ -560,7 +560,7 @@ static ssize_t store_select_amcb1_transm - return strnlen(buf, count); - } - --static DEVICE_ATTR(select_amcb1_transmit_clock, S_IWUGO, NULL, -+static DEVICE_ATTR(select_amcb1_transmit_clock, (S_IWUSR|S_IWGRP), NULL, - store_select_amcb1_transmit_clock); - - static ssize_t store_select_redundant_clock(struct device *d, -@@ -581,7 +581,7 @@ static ssize_t store_select_redundant_cl - return strnlen(buf, count); - } - --static DEVICE_ATTR(select_redundant_clock, S_IWUGO, NULL, -+static DEVICE_ATTR(select_redundant_clock, (S_IWUSR|S_IWGRP), NULL, - store_select_redundant_clock); - - static ssize_t store_select_ref_frequency(struct device *d, -@@ -602,7 +602,7 @@ static ssize_t store_select_ref_frequenc - return strnlen(buf, count); - } - --static DEVICE_ATTR(select_ref_frequency, S_IWUGO, NULL, -+static DEVICE_ATTR(select_ref_frequency, (S_IWUSR|S_IWGRP), NULL, - store_select_ref_frequency); - - static ssize_t store_filter_select(struct device *d, -@@ -623,7 +623,7 @@ static ssize_t store_filter_select(struc - return strnlen(buf, count); - } - --static DEVICE_ATTR(filter_select, S_IWUGO, NULL, store_filter_select); -+static DEVICE_ATTR(filter_select, (S_IWUSR|S_IWGRP), NULL, store_filter_select); - - static ssize_t store_hardware_switching_mode(struct device *d, - struct device_attribute *attr, const char *buf, size_t count) -@@ -643,7 +643,7 @@ static ssize_t store_hardware_switching_ - return strnlen(buf, count); - } - --static DEVICE_ATTR(hardware_switching_mode, S_IWUGO, NULL, -+static DEVICE_ATTR(hardware_switching_mode, (S_IWUSR|S_IWGRP), NULL, - store_hardware_switching_mode); - - static ssize_t store_hardware_switching(struct device *d, -@@ -664,7 +664,7 @@ static ssize_t store_hardware_switching( - return strnlen(buf, count); - } - --static DEVICE_ATTR(hardware_switching, S_IWUGO, NULL, -+static DEVICE_ATTR(hardware_switching, 
(S_IWUSR|S_IWGRP), NULL, - store_hardware_switching); - - static ssize_t store_refalign (struct device *d, -@@ -684,7 +684,7 @@ static ssize_t store_refalign (struct de - return strnlen(buf, count); - } - --static DEVICE_ATTR(refalign, S_IWUGO, NULL, store_refalign); -+static DEVICE_ATTR(refalign, (S_IWUSR|S_IWGRP), NULL, store_refalign); - - static ssize_t store_mode_select (struct device *d, - struct device_attribute *attr, const char *buf, size_t count) -@@ -704,7 +704,7 @@ static ssize_t store_mode_select (struct - return strnlen(buf, count); - } - --static DEVICE_ATTR(mode_select, S_IWUGO, NULL, store_mode_select); -+static DEVICE_ATTR(mode_select, (S_IWUSR|S_IWGRP), NULL, store_mode_select); - - static ssize_t store_reset (struct device *d, - struct device_attribute *attr, const char *buf, size_t count) -@@ -724,7 +724,7 @@ static ssize_t store_reset (struct devic - return strnlen(buf, count); - } - --static DEVICE_ATTR(reset, S_IWUGO, NULL, store_reset); -+static DEVICE_ATTR(reset, (S_IWUSR|S_IWGRP), NULL, store_reset); - - static struct attribute *tlclk_sysfs_entries[] = { - &dev_attr_current_ref.attr, -@@ -767,6 +767,7 @@ static int __init tlclk_init(void) - printk(KERN_ERR "tlclk: can't get major %d.\n", tlclk_major); - return ret; - } -+ tlclk_major = ret; - alarm_events = kzalloc( sizeof(struct tlclk_alarms), GFP_KERNEL); - if (!alarm_events) - goto out1; -diff -upr linux-2.6.16.orig/drivers/char/tty_io.c linux-2.6.16-026test009/drivers/char/tty_io.c ---- linux-2.6.16.orig/drivers/char/tty_io.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/char/tty_io.c 2006-04-19 15:02:12.000000000 +0400 -@@ -86,6 +86,7 @@ - #include <linux/string.h> - #include <linux/slab.h> - #include <linux/poll.h> -+#include <linux/ve_owner.h> - #include <linux/proc_fs.h> - #include <linux/init.h> - #include <linux/module.h> -@@ -105,6 +106,7 @@ - #include <linux/devfs_fs_kernel.h> - - #include <linux/kmod.h> -+#include <ub/ub_mem.h> - - #undef 
TTY_DEBUG_HANGUP - -@@ -122,11 +124,16 @@ struct termios tty_std_termios = { /* fo - - EXPORT_SYMBOL(tty_std_termios); - -+/* this lock protects tty_drivers list, this pretty guys do no locking */ -+rwlock_t tty_driver_guard = RW_LOCK_UNLOCKED; -+EXPORT_SYMBOL(tty_driver_guard); -+ - /* This list gets poked at by procfs and various bits of boot up code. This - could do with some rationalisation such as pulling the tty proc function - into this file */ - - LIST_HEAD(tty_drivers); /* linked list of tty drivers */ -+EXPORT_SYMBOL(tty_drivers); - - /* Semaphore to protect creating and releasing a tty. This is shared with - vt.c for deeply disgusting hack reasons */ -@@ -136,6 +143,15 @@ DECLARE_MUTEX(tty_sem); - extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ - extern int pty_limit; /* Config limit on Unix98 ptys */ - static DEFINE_IDR(allocated_ptys); -+#ifdef CONFIG_VE -+#define __ve_allocated_ptys(ve) (*((ve)->allocated_ptys)) -+#define ve_allocated_ptys __ve_allocated_ptys(get_exec_env()) -+#define ve_ptm_driver (get_exec_env()->ptm_driver) -+#else -+#define __ve_allocated_ptys(ve) allocated_ptys -+#define ve_allocated_ptys allocated_ptys -+#define ve_ptm_driver ptm_driver -+#endif - static DECLARE_MUTEX(allocated_ptys_lock); - static int ptmx_open(struct inode *, struct file *); - #endif -@@ -156,11 +172,25 @@ static int tty_fasync(int fd, struct fil - static void release_mem(struct tty_struct *tty, int idx); - - -+DCL_VE_OWNER(TTYDRV, struct tty_driver, owner_env) -+DCL_VE_OWNER(TTY, struct tty_struct, owner_env) -+ -+void prepare_tty(void) -+{ -+#ifdef CONFIG_VE -+ get_ve0()->allocated_ptys = &allocated_ptys; -+ /* -+ * in this case, tty_register_driver() setups -+ * owner_env correctly right from the bootup -+ */ -+#endif -+} -+ - static struct tty_struct *alloc_tty_struct(void) - { - struct tty_struct *tty; - -- tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL); -+ tty = ub_kmalloc(sizeof(struct tty_struct), GFP_KERNEL); - if 
(tty) - memset(tty, 0, sizeof(struct tty_struct)); - return tty; -@@ -857,14 +887,37 @@ static struct tty_driver *get_tty_driver - { - struct tty_driver *p; - -+ read_lock(&tty_driver_guard); - list_for_each_entry(p, &tty_drivers, tty_drivers) { - dev_t base = MKDEV(p->major, p->minor_start); - if (device < base || device >= base + p->num) - continue; - *index = device - base; -- return p; -+#ifdef CONFIG_VE -+ if (in_interrupt()) -+ goto found; -+ if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR -+#ifdef CONFIG_UNIX98_PTYS -+ && (p->major<UNIX98_PTY_MASTER_MAJOR || -+ p->major>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) && -+ (p->major<UNIX98_PTY_SLAVE_MAJOR || -+ p->major>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) -+#endif -+ ) goto found; -+ if (ve_is_super(VE_OWNER_TTYDRV(p)) && -+ ve_is_super(get_exec_env())) -+ goto found; -+ if (!ve_accessible_strict(VE_OWNER_TTYDRV(p), get_exec_env())) -+ continue; -+#endif -+ goto found; - } -+ read_unlock(&tty_driver_guard); - return NULL; -+ -+found: -+ read_unlock(&tty_driver_guard); -+ return p; - } - - /* -@@ -1092,7 +1145,7 @@ static void do_tty_hangup(void *data) - - read_lock(&tasklist_lock); - if (tty->session > 0) { -- do_each_task_pid(tty->session, PIDTYPE_SID, p) { -+ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { - if (p->signal->tty == tty) - p->signal->tty = NULL; - if (!p->signal->leader) -@@ -1101,7 +1154,7 @@ static void do_tty_hangup(void *data) - send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); - if (tty->pgrp > 0) - p->signal->tty_old_pgrp = tty->pgrp; -- } while_each_task_pid(tty->session, PIDTYPE_SID, p); -+ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); - } - read_unlock(&tasklist_lock); - -@@ -1218,9 +1271,9 @@ void disassociate_ctty(int on_exit) - - /* Now clear signal->tty under the lock */ - read_lock(&tasklist_lock); -- do_each_task_pid(current->signal->session, PIDTYPE_SID, p) { -+ do_each_task_pid_all(current->signal->session, PIDTYPE_SID, p) { - 
p->signal->tty = NULL; -- } while_each_task_pid(current->signal->session, PIDTYPE_SID, p); -+ } while_each_task_pid_all(current->signal->session, PIDTYPE_SID, p); - read_unlock(&tasklist_lock); - up(&tty_sem); - unlock_kernel(); -@@ -1446,21 +1499,28 @@ static inline void tty_line_name(struct - * really quite straightforward. The semaphore locking can probably be - * relaxed for the (most common) case of reopening a tty. - */ --static int init_dev(struct tty_driver *driver, int idx, -- struct tty_struct **ret_tty) -+static int init_dev(struct tty_driver *driver, int idx, -+ struct tty_struct *i_tty, struct tty_struct **ret_tty) - { - struct tty_struct *tty, *o_tty; - struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; - struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; -+ struct ve_struct * owner; - int retval=0; - -- /* check whether we're reopening an existing tty */ -- if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { -- tty = devpts_get_tty(idx); -- if (tty && driver->subtype == PTY_TYPE_MASTER) -- tty = tty->link; -- } else { -- tty = driver->ttys[idx]; -+ owner = VE_OWNER_TTYDRV(driver); -+ -+ if (i_tty) -+ tty = i_tty; -+ else { -+ /* check whether we're reopening an existing tty */ -+ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { -+ tty = devpts_get_tty(idx); -+ if (tty && driver->subtype == PTY_TYPE_MASTER) -+ tty = tty->link; -+ } else { -+ tty = driver->ttys[idx]; -+ } - } - if (tty) goto fast_track; - -@@ -1488,6 +1548,7 @@ static int init_dev(struct tty_driver *d - tty->driver = driver; - tty->index = idx; - tty_line_name(driver, idx, tty->name); -+ SET_VE_OWNER_TTY(tty, owner); - - if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { - tp_loc = &tty->termios; -@@ -1498,7 +1559,7 @@ static int init_dev(struct tty_driver *d - } - - if (!*tp_loc) { -- tp = (struct termios *) kmalloc(sizeof(struct termios), -+ tp = (struct termios *) ub_kmalloc(sizeof(struct termios), - GFP_KERNEL); - if (!tp) - goto free_mem_out; -@@ -1506,7 +1567,7 @@ static int init_dev(struct 
tty_driver *d - } - - if (!*ltp_loc) { -- ltp = (struct termios *) kmalloc(sizeof(struct termios), -+ ltp = (struct termios *) ub_kmalloc(sizeof(struct termios), - GFP_KERNEL); - if (!ltp) - goto free_mem_out; -@@ -1521,6 +1582,7 @@ static int init_dev(struct tty_driver *d - o_tty->driver = driver->other; - o_tty->index = idx; - tty_line_name(driver->other, idx, o_tty->name); -+ SET_VE_OWNER_TTY(o_tty, owner); - - if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { - o_tp_loc = &o_tty->termios; -@@ -1532,7 +1594,7 @@ static int init_dev(struct tty_driver *d - - if (!*o_tp_loc) { - o_tp = (struct termios *) -- kmalloc(sizeof(struct termios), GFP_KERNEL); -+ ub_kmalloc(sizeof(struct termios), GFP_KERNEL); - if (!o_tp) - goto free_mem_out; - *o_tp = driver->other->init_termios; -@@ -1540,7 +1602,7 @@ static int init_dev(struct tty_driver *d - - if (!*o_ltp_loc) { - o_ltp = (struct termios *) -- kmalloc(sizeof(struct termios), GFP_KERNEL); -+ ub_kmalloc(sizeof(struct termios), GFP_KERNEL); - if (!o_ltp) - goto free_mem_out; - memset(o_ltp, 0, sizeof(struct termios)); -@@ -1558,6 +1620,10 @@ static int init_dev(struct tty_driver *d - *o_ltp_loc = o_ltp; - o_tty->termios = *o_tp_loc; - o_tty->termios_locked = *o_ltp_loc; -+#ifdef CONFIG_VE -+ if (driver->other->refcount == 0) -+ (void)get_ve(owner); -+#endif - driver->other->refcount++; - if (driver->subtype == PTY_TYPE_MASTER) - o_tty->count++; -@@ -1582,6 +1648,10 @@ static int init_dev(struct tty_driver *d - *ltp_loc = ltp; - tty->termios = *tp_loc; - tty->termios_locked = *ltp_loc; -+#ifdef CONFIG_VE -+ if (driver->refcount == 0) -+ (void)get_ve(owner); -+#endif - driver->refcount++; - tty->count++; - -@@ -1692,6 +1762,10 @@ static void release_mem(struct tty_struc - } - o_tty->magic = 0; - o_tty->driver->refcount--; -+#ifdef CONFIG_VE -+ if (o_tty->driver->refcount == 0) -+ put_ve(VE_OWNER_TTY(o_tty)); -+#endif - file_list_lock(); - list_del_init(&o_tty->tty_files); - file_list_unlock(); -@@ -1714,6 +1788,10 @@ static void 
release_mem(struct tty_struc - - tty->magic = 0; - tty->driver->refcount--; -+#ifdef CONFIG_VE -+ if (tty->driver->refcount == 0) -+ put_ve(VE_OWNER_TTY(tty)); -+#endif - file_list_lock(); - list_del_init(&tty->tty_files); - file_list_unlock(); -@@ -1737,7 +1815,10 @@ static void release_dev(struct file * fi - int idx; - char buf[64]; - unsigned long flags; -- -+#ifdef CONFIG_UNIX98_PTYS -+ struct idr *idr_alloced; -+#endif -+ - tty = (struct tty_struct *)filp->private_data; - if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "release_dev")) - return; -@@ -1752,6 +1833,9 @@ static void release_dev(struct file * fi - devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0; - devpts_master = pty_master && devpts; - o_tty = tty->link; -+#ifdef CONFIG_UNIX98_PTYS -+ idr_alloced = &__ve_allocated_ptys(tty->owner_env); -+#endif - - #ifdef TTY_PARANOIA_CHECK - if (idx < 0 || idx >= tty->driver->num) { -@@ -1924,13 +2008,13 @@ static void release_dev(struct file * fi - struct task_struct *p; - - read_lock(&tasklist_lock); -- do_each_task_pid(tty->session, PIDTYPE_SID, p) { -+ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; -- } while_each_task_pid(tty->session, PIDTYPE_SID, p); -+ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); - if (o_tty) -- do_each_task_pid(o_tty->session, PIDTYPE_SID, p) { -+ do_each_task_pid_all(o_tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; -- } while_each_task_pid(o_tty->session, PIDTYPE_SID, p); -+ } while_each_task_pid_all(o_tty->session, PIDTYPE_SID, p); - read_unlock(&tasklist_lock); - } - -@@ -2005,7 +2089,7 @@ static void release_dev(struct file * fi - /* Make this pty number available for reallocation */ - if (devpts) { - down(&allocated_ptys_lock); -- idr_remove(&allocated_ptys, idx); -+ idr_remove(idr_alloced, idx); - up(&allocated_ptys_lock); - } - #endif -@@ -2026,7 +2110,7 @@ static void release_dev(struct file * fi - */ - static int tty_open(struct inode * inode, struct file * 
filp) - { -- struct tty_struct *tty; -+ struct tty_struct *tty, *c_tty; - int noctty, retval; - struct tty_driver *driver; - int index; -@@ -2039,6 +2123,7 @@ retry_open: - noctty = filp->f_flags & O_NOCTTY; - index = -1; - retval = 0; -+ c_tty = NULL; - - down(&tty_sem); - -@@ -2049,6 +2134,7 @@ retry_open: - } - driver = current->signal->tty->driver; - index = current->signal->tty->index; -+ c_tty = current->signal->tty; - filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ - /* noctty = 1; */ - goto got_driver; -@@ -2056,6 +2142,12 @@ retry_open: - #ifdef CONFIG_VT - if (device == MKDEV(TTY_MAJOR,0)) { - extern struct tty_driver *console_driver; -+#ifdef CONFIG_VE -+ if (!ve_is_super(get_exec_env())) { -+ up(&tty_sem); -+ return -ENODEV; -+ } -+#endif - driver = console_driver; - index = fg_console; - noctty = 1; -@@ -2063,6 +2155,12 @@ retry_open: - } - #endif - if (device == MKDEV(TTYAUX_MAJOR,1)) { -+#ifdef CONFIG_VE -+ if (!ve_is_super(get_exec_env())) { -+ up(&tty_sem); -+ return -ENODEV; -+ } -+#endif - driver = console_device(&index); - if (driver) { - /* Don't let /dev/console block */ -@@ -2080,7 +2178,7 @@ retry_open: - return -ENODEV; - } - got_driver: -- retval = init_dev(driver, index, &tty); -+ retval = init_dev(driver, index, c_tty, &tty); - up(&tty_sem); - if (retval) - return retval; -@@ -2149,11 +2247,11 @@ static int ptmx_open(struct inode * inod - - /* find a device that is not in use. 
*/ - down(&allocated_ptys_lock); -- if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { -+ if (!idr_pre_get(&ve_allocated_ptys, GFP_KERNEL)) { - up(&allocated_ptys_lock); - return -ENOMEM; - } -- idr_ret = idr_get_new(&allocated_ptys, NULL, &index); -+ idr_ret = idr_get_new(&ve_allocated_ptys, NULL, &index); - if (idr_ret < 0) { - up(&allocated_ptys_lock); - if (idr_ret == -EAGAIN) -@@ -2161,14 +2259,14 @@ static int ptmx_open(struct inode * inod - return -EIO; - } - if (index >= pty_limit) { -- idr_remove(&allocated_ptys, index); -+ idr_remove(&ve_allocated_ptys, index); - up(&allocated_ptys_lock); - return -EIO; - } - up(&allocated_ptys_lock); - - down(&tty_sem); -- retval = init_dev(ptm_driver, index, &tty); -+ retval = init_dev(ve_ptm_driver, index, NULL, &tty); - up(&tty_sem); - - if (retval) -@@ -2183,14 +2281,14 @@ static int ptmx_open(struct inode * inod - goto out1; - - check_tty_count(tty, "tty_open"); -- retval = ptm_driver->open(tty, filp); -+ retval = ve_ptm_driver->open(tty, filp); - if (!retval) - return 0; - out1: - release_dev(filp); - out: - down(&allocated_ptys_lock); -- idr_remove(&allocated_ptys, index); -+ idr_remove(&ve_allocated_ptys, index); - up(&allocated_ptys_lock); - return retval; - } -@@ -2303,6 +2401,8 @@ static int tioccons(struct file *file) - { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; -+ if (!ve_is_super(get_exec_env())) -+ return -EACCES; - if (file->f_op->write == redirected_tty_write) { - struct file *f; - spin_lock(&redirect_lock); -@@ -2363,9 +2463,9 @@ static int tiocsctty(struct tty_struct * - */ - - read_lock(&tasklist_lock); -- do_each_task_pid(tty->session, PIDTYPE_SID, p) { -+ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; -- } while_each_task_pid(tty->session, PIDTYPE_SID, p); -+ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); - read_unlock(&tasklist_lock); - } else - return -EPERM; -@@ -2387,7 +2487,7 @@ static int tiocgpgrp(struct tty_struct * - */ - if (tty == real_tty && 
current->signal->tty != real_tty) - return -ENOTTY; -- return put_user(real_tty->pgrp, p); -+ return put_user(pid_type_to_vpid(PIDTYPE_PGID, real_tty->pgrp), p); - } - - static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -@@ -2407,6 +2507,9 @@ static int tiocspgrp(struct tty_struct * - return -EFAULT; - if (pgrp < 0) - return -EINVAL; -+ pgrp = vpid_to_pid(pgrp); -+ if (pgrp < 0) -+ return -EPERM; - if (session_of_pgrp(pgrp) != current->signal->session) - return -EPERM; - real_tty->pgrp = pgrp; -@@ -2423,7 +2526,7 @@ static int tiocgsid(struct tty_struct *t - return -ENOTTY; - if (real_tty->session <= 0) - return -ENOTTY; -- return put_user(real_tty->session, p); -+ return put_user(pid_type_to_vpid(PIDTYPE_SID, real_tty->session), p); - } - - static int tiocsetd(struct tty_struct *tty, int __user *p) -@@ -2696,7 +2799,7 @@ static void __do_SAK(void *arg) - tty->driver->flush_buffer(tty); - - read_lock(&tasklist_lock); -- do_each_task_pid(session, PIDTYPE_SID, p) { -+ do_each_task_pid_all(session, PIDTYPE_SID, p) { - if (p->signal->tty == tty || session > 0) { - printk(KERN_NOTICE "SAK: killed process %d" - " (%s): p->signal->session==tty->session\n", -@@ -2724,7 +2827,7 @@ static void __do_SAK(void *arg) - rcu_read_unlock(); - } - task_unlock(p); -- } while_each_task_pid(session, PIDTYPE_SID, p); -+ } while_each_task_pid_all(session, PIDTYPE_SID, p); - read_unlock(&tasklist_lock); - #endif - } -@@ -3095,8 +3198,11 @@ int tty_register_driver(struct tty_drive - - if (!driver->put_char) - driver->put_char = tty_default_put_char; -- -+ -+ SET_VE_OWNER_TTYDRV(driver, get_exec_env()); -+ write_lock_irq(&tty_driver_guard); - list_add(&driver->tty_drivers, &tty_drivers); -+ write_unlock_irq(&tty_driver_guard); - - if ( !(driver->flags & TTY_DRIVER_NO_DEVFS) ) { - for(i = 0; i < driver->num; i++) -@@ -3123,7 +3229,9 @@ int tty_unregister_driver(struct tty_dri - unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), - 
driver->num); - -+ write_lock_irq(&tty_driver_guard); - list_del(&driver->tty_drivers); -+ write_unlock_irq(&tty_driver_guard); - - /* - * Free the termios and termios_locked structures because -@@ -3246,6 +3354,7 @@ static int __init tty_init(void) - - vty_init(); - #endif -+ prepare_tty(); - return 0; - } - module_init(tty_init); -diff -upr linux-2.6.16.orig/drivers/edac/Kconfig linux-2.6.16-026test009/drivers/edac/Kconfig ---- linux-2.6.16.orig/drivers/edac/Kconfig 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/edac/Kconfig 2006-04-19 15:02:11.000000000 +0400 -@@ -71,7 +71,7 @@ config EDAC_E7XXX - - config EDAC_E752X - tristate "Intel e752x (e7520, e7525, e7320)" -- depends on EDAC_MM_EDAC && PCI -+ depends on EDAC_MM_EDAC && PCI && HOTPLUG - help - Support for error detection and correction on the Intel - E7520, E7525, E7320 server chipsets. -diff -upr linux-2.6.16.orig/drivers/ieee1394/sbp2.c linux-2.6.16-026test009/drivers/ieee1394/sbp2.c ---- linux-2.6.16.orig/drivers/ieee1394/sbp2.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/ieee1394/sbp2.c 2006-04-19 15:02:11.000000000 +0400 -@@ -495,22 +495,17 @@ static struct sbp2_command_info *sbp2uti - /* - * This function finds the sbp2_command for a given outstanding SCpnt. - * Only looks at the inuse list. -+ * Must be called with scsi_id->sbp2_command_orb_lock held. 
- */ --static struct sbp2_command_info *sbp2util_find_command_for_SCpnt(struct scsi_id_instance_data *scsi_id, void *SCpnt) -+static struct sbp2_command_info *sbp2util_find_command_for_SCpnt( -+ struct scsi_id_instance_data *scsi_id, void *SCpnt) - { - struct sbp2_command_info *command; -- unsigned long flags; - -- spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); -- if (!list_empty(&scsi_id->sbp2_command_orb_inuse)) { -- list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) { -- if (command->Current_SCpnt == SCpnt) { -- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); -+ if (!list_empty(&scsi_id->sbp2_command_orb_inuse)) -+ list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) -+ if (command->Current_SCpnt == SCpnt) - return command; -- } -- } -- } -- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); - return NULL; - } - -@@ -579,17 +574,15 @@ static void sbp2util_free_command_dma(st - - /* - * This function moves a command to the completed orb list. -+ * Must be called with scsi_id->sbp2_command_orb_lock held. 
- */ --static void sbp2util_mark_command_completed(struct scsi_id_instance_data *scsi_id, -- struct sbp2_command_info *command) -+static void sbp2util_mark_command_completed( -+ struct scsi_id_instance_data *scsi_id, -+ struct sbp2_command_info *command) - { -- unsigned long flags; -- -- spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); - list_del(&command->list); - sbp2util_free_command_dma(command); - list_add_tail(&command->list, &scsi_id->sbp2_command_orb_completed); -- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); - } - - /* -@@ -2177,7 +2170,9 @@ static int sbp2_handle_status_write(stru - * Matched status with command, now grab scsi command pointers and check status - */ - SCpnt = command->Current_SCpnt; -+ spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); - sbp2util_mark_command_completed(scsi_id, command); -+ spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); - - if (SCpnt) { - -@@ -2513,6 +2508,7 @@ static int sbp2scsi_abort(struct scsi_cm - (struct scsi_id_instance_data *)SCpnt->device->host->hostdata[0]; - struct sbp2scsi_host_info *hi = scsi_id->hi; - struct sbp2_command_info *command; -+ unsigned long flags; - - SBP2_ERR("aborting sbp2 command"); - scsi_print_command(SCpnt); -@@ -2523,6 +2519,7 @@ static int sbp2scsi_abort(struct scsi_cm - * Right now, just return any matching command structures - * to the free pool. - */ -+ spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); - command = sbp2util_find_command_for_SCpnt(scsi_id, SCpnt); - if (command) { - SBP2_DEBUG("Found command to abort"); -@@ -2540,6 +2537,7 @@ static int sbp2scsi_abort(struct scsi_cm - command->Current_done(command->Current_SCpnt); - } - } -+ spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); - - /* - * Initiate a fetch agent reset. 
-diff -upr linux-2.6.16.orig/drivers/md/dm.c linux-2.6.16-026test009/drivers/md/dm.c ---- linux-2.6.16.orig/drivers/md/dm.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/md/dm.c 2006-04-19 15:02:11.000000000 +0400 -@@ -533,30 +533,35 @@ static void __clone_and_map(struct clone - - } else { - /* -- * Create two copy bios to deal with io that has -- * been split across a target. -+ * Handle a bvec that must be split between two or more targets. - */ - struct bio_vec *bv = bio->bi_io_vec + ci->idx; -+ sector_t remaining = to_sector(bv->bv_len); -+ unsigned int offset = 0; - -- clone = split_bvec(bio, ci->sector, ci->idx, -- bv->bv_offset, max); -- __map_bio(ti, clone, tio); -- -- ci->sector += max; -- ci->sector_count -= max; -- ti = dm_table_find_target(ci->map, ci->sector); -- -- len = to_sector(bv->bv_len) - max; -- clone = split_bvec(bio, ci->sector, ci->idx, -- bv->bv_offset + to_bytes(max), len); -- tio = alloc_tio(ci->md); -- tio->io = ci->io; -- tio->ti = ti; -- memset(&tio->info, 0, sizeof(tio->info)); -- __map_bio(ti, clone, tio); -+ do { -+ if (offset) { -+ ti = dm_table_find_target(ci->map, ci->sector); -+ max = max_io_len(ci->md, ci->sector, ti); -+ -+ tio = alloc_tio(ci->md); -+ tio->io = ci->io; -+ tio->ti = ti; -+ memset(&tio->info, 0, sizeof(tio->info)); -+ } -+ -+ len = min(remaining, max); -+ -+ clone = split_bvec(bio, ci->sector, ci->idx, -+ bv->bv_offset + offset, len); -+ -+ __map_bio(ti, clone, tio); -+ -+ ci->sector += len; -+ ci->sector_count -= len; -+ offset += to_bytes(len); -+ } while (remaining -= len); - -- ci->sector += len; -- ci->sector_count -= len; - ci->idx++; - } - } -diff -upr linux-2.6.16.orig/drivers/media/video/Kconfig linux-2.6.16-026test009/drivers/media/video/Kconfig ---- linux-2.6.16.orig/drivers/media/video/Kconfig 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/media/video/Kconfig 2006-04-19 15:02:11.000000000 +0400 -@@ -349,6 +349,7 @@ config VIDEO_AUDIO_DECODER - 
config VIDEO_DECODER - tristate "Add support for additional video chipsets" - depends on VIDEO_DEV && I2C && EXPERIMENTAL -+ select FW_LOADER - ---help--- - Say Y here to compile drivers for SAA7115, SAA7127 and CX25840 - video decoders. -diff -upr linux-2.6.16.orig/drivers/media/video/tuner-types.c linux-2.6.16-026test009/drivers/media/video/tuner-types.c ---- linux-2.6.16.orig/drivers/media/video/tuner-types.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/media/video/tuner-types.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1087,8 +1087,8 @@ static struct tuner_params tuner_tnf_533 - /* ------------ TUNER_SAMSUNG_TCPN_2121P30A - Samsung NTSC ------------ */ - - static struct tuner_range tuner_samsung_tcpn_2121p30a_ntsc_ranges[] = { -- { 16 * 175.75 /*MHz*/, 0x01, }, -- { 16 * 410.25 /*MHz*/, 0x02, }, -+ { 16 * 130.00 /*MHz*/, 0x01, }, -+ { 16 * 364.50 /*MHz*/, 0x02, }, - { 16 * 999.99 , 0x08, }, - }; - -diff -upr linux-2.6.16.orig/drivers/net/Makefile linux-2.6.16-026test009/drivers/net/Makefile ---- linux-2.6.16.orig/drivers/net/Makefile 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/Makefile 2006-04-19 15:02:12.000000000 +0400 -@@ -18,6 +18,9 @@ gianfar_driver-objs := gianfar.o \ - gianfar_mii.o \ - gianfar_sysfs.o - -+obj-$(CONFIG_VE_NETDEV) += vznetdev.o -+vznetdev-objs := open_vznet.o venet_core.o -+ - # - # link order important here - # -diff -upr linux-2.6.16.orig/drivers/net/irda/irda-usb.c linux-2.6.16-026test009/drivers/net/irda/irda-usb.c ---- linux-2.6.16.orig/drivers/net/irda/irda-usb.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/irda/irda-usb.c 2006-04-19 15:02:11.000000000 +0400 -@@ -740,7 +740,7 @@ static void irda_usb_receive(struct urb - struct sk_buff *newskb; - struct sk_buff *dataskb; - struct urb *next_urb; -- int docopy; -+ unsigned int len, docopy; - - IRDA_DEBUG(2, "%s(), len=%d\n", __FUNCTION__, urb->actual_length); - -@@ -851,10 +851,11 @@ static void 
irda_usb_receive(struct urb - dataskb->dev = self->netdev; - dataskb->mac.raw = dataskb->data; - dataskb->protocol = htons(ETH_P_IRDA); -+ len = dataskb->len; - netif_rx(dataskb); - - /* Keep stats up to date */ -- self->stats.rx_bytes += dataskb->len; -+ self->stats.rx_bytes += len; - self->stats.rx_packets++; - self->netdev->last_rx = jiffies; - -diff -upr linux-2.6.16.orig/drivers/net/loopback.c linux-2.6.16-026test009/drivers/net/loopback.c ---- linux-2.6.16.orig/drivers/net/loopback.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/loopback.c 2006-04-19 15:02:12.000000000 +0400 -@@ -130,6 +130,11 @@ static int loopback_xmit(struct sk_buff - { - struct net_device_stats *lb_stats; - -+ if (unlikely(get_exec_env()->disable_net)) { -+ kfree_skb(skb); -+ return 0; -+ } -+ - skb_orphan(skb); - - skb->protocol = eth_type_trans(skb,dev); -@@ -198,6 +203,34 @@ static struct ethtool_ops loopback_ethto - .set_tso = ethtool_op_set_tso, - }; - -+static void loopback_destructor(struct net_device *dev) -+{ -+ kfree(dev->priv); -+ dev->priv = NULL; -+} -+ -+struct net_device templ_loopback_dev = { -+ .name = "lo", -+ .mtu = (16 * 1024) + 20 + 20 + 12, -+ .hard_start_xmit = loopback_xmit, -+ .hard_header = eth_header, -+ .hard_header_cache = eth_header_cache, -+ .header_cache_update = eth_header_cache_update, -+ .hard_header_len = ETH_HLEN, /* 14 */ -+ .addr_len = ETH_ALEN, /* 6 */ -+ .tx_queue_len = 0, -+ .type = ARPHRD_LOOPBACK, /* 0x0001*/ -+ .rebuild_header = eth_rebuild_header, -+ .flags = IFF_LOOPBACK, -+ .features = NETIF_F_SG|NETIF_F_FRAGLIST -+ |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA -+ |NETIF_F_LLTX|NETIF_F_VIRTUAL, -+}; -+ -+#ifdef loopback_dev -+#undef loopback_dev -+#endif -+ - struct net_device loopback_dev = { - .name = "lo", - .mtu = (16 * 1024) + 20 + 20 + 12, -@@ -231,9 +264,13 @@ int __init loopback_init(void) - memset(stats, 0, sizeof(struct net_device_stats)); - loopback_dev.priv = stats; - loopback_dev.get_stats = &get_stats; -+ 
loopback_dev.destructor = &loopback_destructor; - } -- -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+ get_ve0()->_loopback_dev = &loopback_dev; -+#endif - return register_netdev(&loopback_dev); - }; - - EXPORT_SYMBOL(loopback_dev); -+EXPORT_SYMBOL(templ_loopback_dev); -diff -upr linux-2.6.16.orig/drivers/net/open_vznet.c linux-2.6.16-026test009/drivers/net/open_vznet.c ---- linux-2.6.16.orig/drivers/net/open_vznet.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/open_vznet.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,190 @@ -+/* -+ * open_vznet.c -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+/* -+ * Virtual Networking device used to change VE ownership on packets -+ */ -+ -+#include <linux/kernel.h> -+#include <linux/module.h> -+#include <linux/seq_file.h> -+ -+#include <linux/inet.h> -+#include <net/ip.h> -+#include <linux/skbuff.h> -+#include <linux/venet.h> -+ -+void veip_stop(struct ve_struct *ve) -+{ -+ struct list_head *p, *tmp; -+ -+ write_lock_irq(&veip_hash_lock); -+ if (ve->veip == NULL) -+ goto unlock; -+ list_for_each_safe(p, tmp, &ve->veip->ip_lh) { -+ struct ip_entry_struct *ptr; -+ ptr = list_entry(p, struct ip_entry_struct, ve_list); -+ ptr->active_env = NULL; -+ list_del(&ptr->ve_list); -+ list_del(&ptr->ip_hash); -+ kfree(ptr); -+ } -+ veip_put(ve->veip); -+ ve->veip = NULL; -+unlock: -+ write_unlock_irq(&veip_hash_lock); -+} -+ -+int veip_start(struct ve_struct *ve) -+{ -+ int err; -+ -+ err = 0; -+ write_lock_irq(&veip_hash_lock); -+ ve->veip = veip_findcreate(ve->veid); -+ if (ve->veip == NULL) -+ err = -ENOMEM; -+ write_unlock_irq(&veip_hash_lock); -+ return err; -+} -+ -+int veip_entry_add(struct ve_struct *ve, struct sockaddr_in *addr) -+{ -+ struct ip_entry_struct *entry, *found; -+ int err; -+ -+ entry = kmalloc(sizeof(struct ip_entry_struct), GFP_KERNEL); -+ if (entry == NULL) -+ return 
-ENOMEM; -+ -+ memset(entry, 0, sizeof(struct ip_entry_struct)); -+ entry->ip = addr->sin_addr.s_addr; -+ -+ write_lock_irq(&veip_hash_lock); -+ err = -EADDRINUSE; -+ found = ip_entry_lookup(entry->ip); -+ if (found != NULL) -+ goto out_unlock; -+ else { -+ ip_entry_hash(entry, ve->veip); -+ found = entry; -+ entry = NULL; -+ } -+ err = 0; -+ found->active_env = ve; -+out_unlock: -+ write_unlock_irq(&veip_hash_lock); -+ if (entry != NULL) -+ kfree(entry); -+ return err; -+} -+ -+int veip_entry_del(envid_t veid, struct sockaddr_in *addr) -+{ -+ struct ip_entry_struct *found; -+ int err; -+ -+ err = -EADDRNOTAVAIL; -+ write_lock_irq(&veip_hash_lock); -+ found = ip_entry_lookup(addr->sin_addr.s_addr); -+ if (found == NULL) -+ goto out; -+ if (found->active_env->veid != veid) -+ goto out; -+ -+ err = 0; -+ found->active_env = NULL; -+ -+ list_del(&found->ip_hash); -+ list_del(&found->ve_list); -+ kfree(found); -+out: -+ write_unlock_irq(&veip_hash_lock); -+ return err; -+} -+ -+static struct ve_struct *venet_find_ve(__u32 ip) -+{ -+ struct ip_entry_struct *entry; -+ -+ entry = ip_entry_lookup(ip); -+ if (entry == NULL) -+ return NULL; -+ -+ return entry->active_env; -+} -+ -+int venet_change_skb_owner(struct sk_buff *skb) -+{ -+ struct ve_struct *ve, *ve_old; -+ struct iphdr *iph; -+ -+ ve_old = skb->owner_env; -+ iph = skb->nh.iph; -+ -+ read_lock(&veip_hash_lock); -+ if (!ve_is_super(ve_old)) { -+ /* from VE to host */ -+ ve = venet_find_ve(iph->saddr); -+ if (ve == NULL) -+ goto out_drop; -+ if (!ve_accessible_strict(ve, ve_old)) -+ goto out_source; -+ skb->owner_env = get_ve0(); -+ } else { -+ /* from host to VE */ -+ ve = venet_find_ve(iph->daddr); -+ if (ve == NULL) -+ goto out_drop; -+ skb->owner_env = ve; -+ } -+ read_unlock(&veip_hash_lock); -+ -+ return 0; -+ -+out_drop: -+ read_unlock(&veip_hash_lock); -+ return -ESRCH; -+ -+out_source: -+ read_unlock(&veip_hash_lock); -+ if (net_ratelimit()) { -+ printk(KERN_WARNING "Dropped packet, source wrong " -+ 
"veid=%u src-IP=%u.%u.%u.%u " -+ "dst-IP=%u.%u.%u.%u\n", -+ skb->owner_env->veid, -+ NIPQUAD(skb->nh.iph->saddr), -+ NIPQUAD(skb->nh.iph->daddr)); -+ } -+ return -EACCES; -+} -+ -+#ifdef CONFIG_PROC_FS -+int veip_seq_show(struct seq_file *m, void *v) -+{ -+ struct list_head *p; -+ struct ip_entry_struct *entry; -+ char s[16]; -+ -+ p = (struct list_head *)v; -+ if (p == ip_entry_hash_table) { -+ seq_puts(m, "Version: 2.5\n"); -+ return 0; -+ } -+ entry = list_entry(p, struct ip_entry_struct, ip_hash); -+ sprintf(s, "%u.%u.%u.%u", NIPQUAD(entry->ip)); -+ seq_printf(m, "%15s %10u\n", s, 0); -+ return 0; -+} -+#endif -+ -+MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); -+MODULE_DESCRIPTION("Virtuozzo Virtual Network Device"); -+MODULE_LICENSE("GPL v2"); -diff -upr linux-2.6.16.orig/drivers/net/sky2.c linux-2.6.16-026test009/drivers/net/sky2.c ---- linux-2.6.16.orig/drivers/net/sky2.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/sky2.c 2006-04-19 15:02:11.000000000 +0400 -@@ -579,8 +579,8 @@ static void sky2_mac_init(struct sky2_hw - reg = gma_read16(hw, port, GM_PHY_ADDR); - gma_write16(hw, port, GM_PHY_ADDR, reg | GM_PAR_MIB_CLR); - -- for (i = 0; i < GM_MIB_CNT_SIZE; i++) -- gma_read16(hw, port, GM_MIB_CNT_BASE + 8 * i); -+ for (i = GM_MIB_CNT_BASE; i <= GM_MIB_CNT_END; i += 4) -+ gma_read16(hw, port, i); - gma_write16(hw, port, GM_PHY_ADDR, reg); - - /* transmit control */ -diff -upr linux-2.6.16.orig/drivers/net/sky2.h linux-2.6.16-026test009/drivers/net/sky2.h ---- linux-2.6.16.orig/drivers/net/sky2.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/sky2.h 2006-04-19 15:02:11.000000000 +0400 -@@ -1380,6 +1380,7 @@ enum { - /* MIB Counters */ - #define GM_MIB_CNT_BASE 0x0100 /* Base Address of MIB Counters */ - #define GM_MIB_CNT_SIZE 44 /* Number of MIB Counters */ -+#define GM_MIB_CNT_END 0x025C /* Last MIB counter */ - - /* - * MIB Counters base address definitions (low word) - -diff -upr 
linux-2.6.16.orig/drivers/net/tun.c linux-2.6.16-026test009/drivers/net/tun.c ---- linux-2.6.16.orig/drivers/net/tun.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/tun.c 2006-04-19 15:02:12.000000000 +0400 -@@ -62,6 +62,7 @@ - - #include <asm/system.h> - #include <asm/uaccess.h> -+#include <ub/beancounter.h> - - #ifdef TUN_DEBUG - static int debug; -@@ -90,6 +91,7 @@ static int tun_net_close(struct net_devi - static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev) - { - struct tun_struct *tun = netdev_priv(dev); -+ struct user_beancounter *ub; - - DBG(KERN_INFO "%s: tun_net_xmit %d\n", tun->dev->name, skb->len); - -@@ -114,6 +116,18 @@ static int tun_net_xmit(struct sk_buff * - } - } - -+ ub = netdev_bc(dev)->exec_ub; -+ if (ub && (skb_bc(skb)->charged == 0)) { -+ unsigned long charge; -+ charge = skb_charge_fullsize(skb); -+ if (charge_beancounter(ub, UB_OTHERSOCKBUF, charge, 1)) -+ goto drop; -+ get_beancounter(ub); -+ skb_bc(skb)->ub = ub; -+ skb_bc(skb)->charged = charge; -+ skb_bc(skb)->resource = UB_OTHERSOCKBUF; -+ } -+ - /* Queue packet */ - skb_queue_tail(&tun->readq, skb); - dev->trans_start = jiffies; -@@ -410,12 +424,14 @@ static ssize_t tun_chr_readv(struct file - tun->dev->name, addr[0], addr[1], addr[2], - addr[3], addr[4], addr[5]); - ret = tun_put_user(tun, skb, (struct iovec *) iv, len); -+ /* skb will be uncharged in kfree_skb() */ - kfree_skb(skb); - break; - } else { - DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %x:%x:%x:%x:%x:%x\n", - tun->dev->name, addr[0], addr[1], addr[2], - addr[3], addr[4], addr[5]); -+ /* skb will be uncharged in kfree_skb() */ - kfree_skb(skb); - continue; - } -@@ -451,6 +467,7 @@ static void tun_setup(struct net_device - dev->get_stats = tun_net_stats; - dev->ethtool_ops = &tun_ethtool_ops; - dev->destructor = free_netdev; -+ dev->features |= NETIF_F_VIRTUAL; - } - - static struct tun_struct *tun_get_by_name(const char *name) -@@ -459,8 +476,9 @@ static struct tun_struct 
*tun_get_by_nam - - ASSERT_RTNL(); - list_for_each_entry(tun, &tun_dev_list, list) { -- if (!strncmp(tun->dev->name, name, IFNAMSIZ)) -- return tun; -+ if (ve_accessible_strict(tun->dev->owner_env, get_exec_env()) && -+ !strncmp(tun->dev->name, name, IFNAMSIZ)) -+ return tun; - } - - return NULL; -@@ -479,7 +497,8 @@ static int tun_set_iff(struct file *file - - /* Check permissions */ - if (tun->owner != -1 && -- current->euid != tun->owner && !capable(CAP_NET_ADMIN)) -+ current->euid != tun->owner && -+ !capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) - return -EPERM; - } - else if (__dev_get_by_name(ifr->ifr_name)) -diff -upr linux-2.6.16.orig/drivers/net/venet_core.c linux-2.6.16-026test009/drivers/net/venet_core.c ---- linux-2.6.16.orig/drivers/net/venet_core.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/venet_core.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,626 @@ -+/* -+ * venet_core.c -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+/* -+ * Common part for Virtuozzo virtual network devices -+ */ -+ -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/interrupt.h> -+#include <linux/fs.h> -+#include <linux/types.h> -+#include <linux/string.h> -+#include <linux/socket.h> -+#include <linux/errno.h> -+#include <linux/fcntl.h> -+#include <linux/in.h> -+#include <linux/init.h> -+#include <linux/module.h> -+#include <linux/tcp.h> -+#include <linux/proc_fs.h> -+#include <linux/seq_file.h> -+ -+#include <asm/system.h> -+#include <asm/uaccess.h> -+#include <asm/io.h> -+#include <asm/unistd.h> -+ -+#include <linux/inet.h> -+#include <linux/netdevice.h> -+#include <linux/etherdevice.h> -+#include <net/ip.h> -+#include <linux/skbuff.h> -+#include <net/sock.h> -+#include <linux/if_ether.h> /* For the statistics structure. 
*/ -+#include <linux/if_arp.h> /* For ARPHRD_ETHER */ -+#include <linux/venet.h> -+#include <linux/ve_proto.h> -+#include <linux/vzctl.h> -+#include <linux/vzctl_venet.h> -+ -+struct list_head ip_entry_hash_table[VEIP_HASH_SZ]; -+rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED; -+LIST_HEAD(veip_lh); -+ -+#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1)) -+ -+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip) -+{ -+ list_add(&entry->ip_hash, -+ ip_entry_hash_table + ip_entry_hash_function(entry->ip)); -+ list_add(&entry->ve_list, &veip->ip_lh); -+} -+ -+void veip_put(struct veip_struct *veip) -+{ -+ if (!list_empty(&veip->ip_lh)) -+ return; -+ if (!list_empty(&veip->src_lh)) -+ return; -+ if (!list_empty(&veip->dst_lh)) -+ return; -+ -+ list_del(&veip->list); -+ kfree(veip); -+} -+ -+struct ip_entry_struct *ip_entry_lookup(u32 addr) -+{ -+ struct ip_entry_struct *entry; -+ struct list_head *tmp; -+ -+ list_for_each(tmp, ip_entry_hash_table + ip_entry_hash_function(addr)) { -+ entry = list_entry(tmp, struct ip_entry_struct, ip_hash); -+ if (entry->ip != addr) -+ continue; -+ return entry; -+ } -+ return NULL; -+} -+ -+struct veip_struct *veip_find(envid_t veid) -+{ -+ struct veip_struct *ptr; -+ list_for_each_entry(ptr, &veip_lh, list) { -+ if (ptr->veid != veid) -+ continue; -+ return ptr; -+ } -+ return NULL; -+} -+ -+struct veip_struct *veip_findcreate(envid_t veid) -+{ -+ struct veip_struct *ptr; -+ -+ ptr = veip_find(veid); -+ if (ptr != NULL) -+ return ptr; -+ -+ ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC); -+ if (ptr == NULL) -+ return NULL; -+ memset(ptr, 0, sizeof(struct veip_struct)); -+ INIT_LIST_HEAD(&ptr->ip_lh); -+ INIT_LIST_HEAD(&ptr->src_lh); -+ INIT_LIST_HEAD(&ptr->dst_lh); -+ list_add(&ptr->list, &veip_lh); -+ ptr->veid = veid; -+ return ptr; -+} -+ -+/* -+ * Device functions -+ */ -+ -+static int venet_open(struct net_device *dev) -+{ -+ if (!try_module_get(THIS_MODULE)) -+ return -EBUSY; -+ return 0; 
-+} -+ -+static int venet_close(struct net_device *master) -+{ -+ module_put(THIS_MODULE); -+ return 0; -+} -+ -+static void venet_destructor(struct net_device *dev) -+{ -+ kfree(dev->priv); -+ dev->priv = NULL; -+} -+ -+/* -+ * The higher levels take care of making this non-reentrant (it's -+ * called with bh's disabled). -+ */ -+static int venet_xmit(struct sk_buff *skb, struct net_device *dev) -+{ -+ struct net_device_stats *stats = (struct net_device_stats *)dev->priv; -+ struct net_device *rcv = NULL; -+ struct iphdr *iph; -+ int length; -+ -+ if (unlikely(get_exec_env()->disable_net)) -+ goto outf; -+ -+ /* -+ * Optimise so buffers with skb->free=1 are not copied but -+ * instead are lobbed from tx queue to rx queue -+ */ -+ if (atomic_read(&skb->users) != 1) { -+ struct sk_buff *skb2 = skb; -+ skb = skb_clone(skb, GFP_ATOMIC); /* Clone the buffer */ -+ if (skb == NULL) { -+ kfree_skb(skb2); -+ goto out; -+ } -+ kfree_skb(skb2); -+ } else -+ skb_orphan(skb); -+ -+ if (skb->protocol != __constant_htons(ETH_P_IP)) -+ goto outf; -+ -+ iph = skb->nh.iph; -+ if (MULTICAST(iph->daddr)) -+ goto outf; -+ -+ if (venet_change_skb_owner(skb) < 0) -+ goto outf; -+ -+ if (unlikely(VE_OWNER_SKB(skb)->disable_net)) -+ goto outf; -+ -+ rcv = VE_OWNER_SKB(skb)->_venet_dev; -+ if (!rcv) -+ /* VE going down */ -+ goto outf; -+ -+ dev_hold(rcv); -+ -+ if (!(rcv->flags & IFF_UP)) { -+ /* Target VE does not want to receive packets */ -+ dev_put(rcv); -+ goto outf; -+ } -+ -+ skb->pkt_type = PACKET_HOST; -+ skb->dev = rcv; -+ -+ skb->mac.raw = skb->data; -+ memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len); -+ -+ dst_release(skb->dst); -+ skb->dst = NULL; -+#ifdef CONFIG_NETFILTER -+ nf_conntrack_put(skb->nfct); -+ skb->nfct = NULL; -+#ifdef CONFIG_NETFILTER_DEBUG -+ skb->nf_debug = 0; -+#endif -+#endif -+ length = skb->len; -+ -+ netif_rx(skb); -+ -+ stats->tx_bytes += length; -+ stats->tx_packets++; -+ if (rcv) { -+ struct net_device_stats *rcv_stats = -+ (struct 
net_device_stats *)rcv->priv; -+ rcv_stats->rx_bytes += length; -+ rcv_stats->rx_packets++; -+ dev_put(rcv); -+ } -+ -+ return 0; -+ -+outf: -+ kfree_skb(skb); -+ ++stats->tx_dropped; -+out: -+ return 0; -+} -+ -+static struct net_device_stats *get_stats(struct net_device *dev) -+{ -+ return (struct net_device_stats *)dev->priv; -+} -+ -+/* Initialize the rest of the LOOPBACK device. */ -+int venet_init_dev(struct net_device *dev) -+{ -+ dev->hard_start_xmit = venet_xmit; -+ dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); -+ if (dev->priv == NULL) -+ return -ENOMEM; -+ memset(dev->priv, 0, sizeof(struct net_device_stats)); -+ dev->get_stats = get_stats; -+ dev->open = venet_open; -+ dev->stop = venet_close; -+ dev->destructor = venet_destructor; -+ -+ /* -+ * Fill in the generic fields of the device structure. -+ */ -+ dev->type = ARPHRD_VOID; -+ dev->hard_header_len = ETH_HLEN; -+ dev->mtu = 1500; /* eth_mtu */ -+ dev->tx_queue_len = 0; -+ -+ memset(dev->broadcast, 0xFF, ETH_ALEN); -+ -+ /* New-style flags. 
*/ -+ dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT; -+ return 0; -+} -+ -+static void venet_setup(struct net_device *dev) -+{ -+ dev->init = venet_init_dev; -+ /* -+ * No other features, as they are: -+ * - checksumming is required, and nobody else will done our job -+ */ -+ dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL; -+} -+ -+#ifdef CONFIG_PROC_FS -+static int veinfo_seq_show(struct seq_file *m, void *v) -+{ -+ struct ve_struct *ve = (struct ve_struct *)v; -+ struct list_head *tmp; -+ -+ seq_printf(m, "%10u %5u %5u", ve->veid, -+ ve->class_id, atomic_read(&ve->pcounter)); -+ read_lock(&veip_hash_lock); -+ if (ve->veip == NULL) -+ goto unlock; -+ list_for_each(tmp, &ve->veip->ip_lh) { -+ char ip[16]; -+ struct ip_entry_struct *entry; -+ -+ entry = list_entry(tmp, struct ip_entry_struct, ve_list); -+ if (entry->active_env == NULL) -+ continue; -+ -+ sprintf(ip, "%u.%u.%u.%u", NIPQUAD(entry->ip)); -+ seq_printf(m, " %15s", ip); -+ } -+unlock: -+ read_unlock(&veip_hash_lock); -+ seq_putc(m, '\n'); -+ return 0; -+} -+ -+static void *ve_seq_start(struct seq_file *m, loff_t *pos) -+{ -+ struct ve_struct *ve, *curve; -+ loff_t l; -+ -+ curve = get_exec_env(); -+ read_lock(&ve_list_guard); -+ if (!ve_is_super(curve)) { -+ if (*pos != 0) -+ return NULL; -+ return curve; -+ } -+ for (ve = ve_list_head, l = *pos; -+ ve != NULL && l > 0; -+ ve = ve->next, l--); -+ return ve; -+} -+ -+static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) -+{ -+ struct ve_struct *ve = (struct ve_struct *)v; -+ -+ if (!ve_is_super(get_exec_env())) -+ return NULL; -+ (*pos)++; -+ return ve->next; -+} -+ -+static void ve_seq_stop(struct seq_file *m, void *v) -+{ -+ read_unlock(&ve_list_guard); -+} -+ -+ -+static struct seq_operations veinfo_seq_op = { -+ start: ve_seq_start, -+ next: ve_seq_next, -+ stop: ve_seq_stop, -+ show: veinfo_seq_show -+}; -+ -+static int veinfo_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &veinfo_seq_op); -+} -+ 
-+static struct file_operations proc_veinfo_operations = { -+ open: veinfo_open, -+ read: seq_read, -+ llseek: seq_lseek, -+ release: seq_release -+}; -+ -+static void *veip_seq_start(struct seq_file *m, loff_t *pos) -+{ -+ loff_t l; -+ struct list_head *p; -+ int i; -+ -+ l = *pos; -+ write_lock_irq(&veip_hash_lock); -+ if (l == 0) -+ return ip_entry_hash_table; -+ for (i = 0; i < VEIP_HASH_SZ; i++) { -+ list_for_each(p, ip_entry_hash_table + i) { -+ if (--l == 0) -+ return p; -+ } -+ } -+ return NULL; -+} -+ -+static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos) -+{ -+ struct list_head *p; -+ -+ p = (struct list_head *)v; -+ while (1) { -+ p = p->next; -+ if (p < ip_entry_hash_table || -+ p >= ip_entry_hash_table + VEIP_HASH_SZ) { -+ (*pos)++; -+ return p; -+ } -+ if (++p >= ip_entry_hash_table + VEIP_HASH_SZ) -+ return NULL; -+ } -+ return NULL; -+} -+ -+static void veip_seq_stop(struct seq_file *m, void *v) -+{ -+ write_unlock_irq(&veip_hash_lock); -+} -+ -+static struct seq_operations veip_seq_op = { -+ start: veip_seq_start, -+ next: veip_seq_next, -+ stop: veip_seq_stop, -+ show: veip_seq_show -+}; -+ -+static int veip_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &veip_seq_op); -+} -+ -+static struct file_operations proc_veip_operations = { -+ open: veip_open, -+ read: seq_read, -+ llseek: seq_lseek, -+ release: seq_release -+}; -+#endif -+ -+int real_ve_ip_map(envid_t veid, int op, struct sockaddr *uservaddr, int addrlen) -+{ -+ int err; -+ struct sockaddr_in addr; -+ struct ve_struct *ve; -+ -+ err = -EPERM; -+ if (!capable(CAP_SETVEID)) -+ goto out; -+ -+ err = -EINVAL; -+ if (addrlen != sizeof(struct sockaddr_in)) -+ goto out; -+ -+ err = move_addr_to_kernel(uservaddr, addrlen, &addr); -+ if (err < 0) -+ goto out; -+ -+ switch (op) -+ { -+ case VE_IP_ADD: -+ ve = get_ve_by_id(veid); -+ err = -ESRCH; -+ if (!ve) -+ goto out; -+ -+ down_read(&ve->op_sem); -+ if (ve->is_running) -+ err = veip_entry_add(ve, 
&addr); -+ up_read(&ve->op_sem); -+ put_ve(ve); -+ break; -+ -+ case VE_IP_DEL: -+ err = veip_entry_del(veid, &addr); -+ break; -+ default: -+ err = -EINVAL; -+ } -+ -+out: -+ return err; -+} -+ -+int venet_ioctl(struct inode *ino, struct file *file, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err; -+ -+ err = -ENOTTY; -+ switch(cmd) { -+ case VENETCTL_VE_IP_MAP: { -+ struct vzctl_ve_ip_map s; -+ err = -EFAULT; -+ if (copy_from_user(&s, (void *)arg, sizeof(s))) -+ break; -+ err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen); -+ } -+ break; -+ } -+ return err; -+} -+ -+static struct vzioctlinfo venetcalls = { -+ type: VENETCTLTYPE, -+ func: venet_ioctl, -+ owner: THIS_MODULE, -+}; -+ -+int venet_dev_start(struct ve_struct *env) -+{ -+ struct net_device *dev_venet; -+ int err; -+ -+ dev_venet = alloc_netdev(0, "venet%d", venet_setup); -+ if (!dev_venet) -+ return -ENOMEM; -+ err = dev_alloc_name(dev_venet, dev_venet->name); -+ if (err<0) -+ goto err; -+ if ((err = register_netdev(dev_venet)) != 0) -+ goto err; -+ env->_venet_dev = dev_venet; -+ return 0; -+err: -+ free_netdev(dev_venet); -+ printk(KERN_ERR "VENET initialization error err=%d\n", err); -+ return err; -+} -+ -+static int venet_start(unsigned int hooknum, void *data) -+{ -+ struct ve_struct *env; -+ int err; -+ -+ env = (struct ve_struct *)data; -+ if (env->veip) -+ return -EEXIST; -+ if (!ve_is_super(env) && !try_module_get(THIS_MODULE)) -+ return 0; -+ -+ err = veip_start(env); -+ if (err) -+ goto err; -+ -+ err = venet_dev_start(env); -+ if (err) -+ goto err_free; -+ return 0; -+ -+err_free: -+ veip_stop(env); -+err: -+ if (!ve_is_super(env)) -+ module_put(THIS_MODULE); -+ return err; -+} -+ -+static int venet_stop(unsigned int hooknum, void *data) -+{ -+ struct ve_struct *env; -+ -+ env = (struct ve_struct *)data; -+ veip_stop(env); -+ if (!ve_is_super(env)) -+ module_put(THIS_MODULE); -+ return 0; -+} -+ -+#define VE_HOOK_PRI_NET 0 -+ -+static struct ve_hook venet_ve_hook_init = { -+ hook: 
venet_start, -+ undo: venet_stop, -+ hooknum: VE_HOOK_INIT, -+ priority: VE_HOOK_PRI_NET -+}; -+ -+static struct ve_hook venet_ve_hook_fini = { -+ hook: venet_stop, -+ hooknum: VE_HOOK_FINI, -+ priority: VE_HOOK_PRI_NET -+}; -+ -+__init int venet_init(void) -+{ -+#ifdef CONFIG_PROC_FS -+ struct proc_dir_entry *de; -+#endif -+ int i, err; -+ -+ if (get_ve0()->_venet_dev != NULL) -+ return -EEXIST; -+ -+ for (i = 0; i < VEIP_HASH_SZ; i++) -+ INIT_LIST_HEAD(ip_entry_hash_table + i); -+ -+ err = venet_start(VE_HOOK_INIT, (void *)get_ve0()); -+ if (err) -+ return err; -+ -+#ifdef CONFIG_PROC_FS -+ de = create_proc_glob_entry("vz/veinfo", -+ S_IFREG|S_IRUSR, NULL); -+ if (de) -+ de->proc_fops = &proc_veinfo_operations; -+ else -+ printk(KERN_WARNING "venet: can't make veinfo proc entry\n"); -+ -+ de = create_proc_entry("vz/veip", S_IFREG|S_IRUSR, NULL); -+ if (de) -+ de->proc_fops = &proc_veip_operations; -+ else -+ printk(KERN_WARNING "venet: can't make veip proc entry\n"); -+#endif -+ -+ ve_hook_register(&venet_ve_hook_init); -+ ve_hook_register(&venet_ve_hook_fini); -+ vzioctl_register(&venetcalls); -+ return 0; -+} -+ -+__exit void venet_exit(void) -+{ -+ struct net_device *dev_venet; -+ -+ vzioctl_unregister(&venetcalls); -+ ve_hook_unregister(&venet_ve_hook_fini); -+ ve_hook_unregister(&venet_ve_hook_init); -+#ifdef CONFIG_PROC_FS -+ remove_proc_entry("vz/veip", NULL); -+ remove_proc_entry("vz/veinfo", NULL); -+#endif -+ -+ dev_venet = get_ve0()->_venet_dev; -+ if (dev_venet != NULL) { -+ get_ve0()->_venet_dev = NULL; -+ unregister_netdev(dev_venet); -+ free_netdev(dev_venet); -+ } -+ veip_stop(get_ve0()); -+} -+ -+module_init(venet_init); -+module_exit(venet_exit); -diff -upr linux-2.6.16.orig/drivers/net/wireless/Kconfig linux-2.6.16-026test009/drivers/net/wireless/Kconfig ---- linux-2.6.16.orig/drivers/net/wireless/Kconfig 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/wireless/Kconfig 2006-04-19 15:02:11.000000000 +0400 -@@ -239,7 
+239,8 @@ config IPW2200_DEBUG - - config AIRO - tristate "Cisco/Aironet 34X/35X/4500/4800 ISA and PCI cards" -- depends on NET_RADIO && ISA_DMA_API && CRYPTO && (PCI || BROKEN) -+ depends on NET_RADIO && ISA_DMA_API && (PCI || BROKEN) -+ select CRYPTO - ---help--- - This is the standard Linux driver to support Cisco/Aironet ISA and - PCI 802.11 wireless cards. -@@ -374,6 +375,7 @@ config PCMCIA_HERMES - config PCMCIA_SPECTRUM - tristate "Symbol Spectrum24 Trilogy PCMCIA card support" - depends on NET_RADIO && PCMCIA && HERMES -+ select FW_LOADER - ---help--- - - This is a driver for 802.11b cards using RAM-loadable Symbol -@@ -387,6 +389,7 @@ config PCMCIA_SPECTRUM - config AIRO_CS - tristate "Cisco/Aironet 34X/35X/4500/4800 PCMCIA cards" - depends on NET_RADIO && PCMCIA && (BROKEN || !M32R) -+ select CRYPTO - ---help--- - This is the standard Linux driver to support Cisco/Aironet PCMCIA - 802.11 wireless cards. This driver is the same as the Aironet -diff -upr linux-2.6.16.orig/drivers/net/wireless/hostap/hostap_80211_tx.c linux-2.6.16-026test009/drivers/net/wireless/hostap/hostap_80211_tx.c ---- linux-2.6.16.orig/drivers/net/wireless/hostap/hostap_80211_tx.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/wireless/hostap/hostap_80211_tx.c 2006-04-19 15:02:11.000000000 +0400 -@@ -469,7 +469,7 @@ int hostap_master_start_xmit(struct sk_b - } - - if (local->ieee_802_1x && meta->ethertype == ETH_P_PAE && tx.crypt && -- !(fc & IEEE80211_FCTL_VERS)) { -+ !(fc & IEEE80211_FCTL_PROTECTED)) { - no_encrypt = 1; - PDEBUG(DEBUG_EXTRA2, "%s: TX: IEEE 802.1X - passing " - "unencrypted EAPOL frame\n", dev->name); -diff -upr linux-2.6.16.orig/drivers/net/wireless/ipw2200.c linux-2.6.16-026test009/drivers/net/wireless/ipw2200.c ---- linux-2.6.16.orig/drivers/net/wireless/ipw2200.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/net/wireless/ipw2200.c 2006-04-19 15:02:11.000000000 +0400 -@@ -9956,9 +9956,8 @@ static int 
ipw_ethtool_set_eeprom(struct - return -EINVAL; - down(&p->sem); - memcpy(&p->eeprom[eeprom->offset], bytes, eeprom->len); -- for (i = IPW_EEPROM_DATA; -- i < IPW_EEPROM_DATA + IPW_EEPROM_IMAGE_SIZE; i++) -- ipw_write8(p, i, p->eeprom[i]); -+ for (i = 0; i < IPW_EEPROM_IMAGE_SIZE; i++) -+ ipw_write8(p, i + IPW_EEPROM_DATA, p->eeprom[i]); - up(&p->sem); - return 0; - } -diff -upr linux-2.6.16.orig/drivers/pci/probe.c linux-2.6.16-026test009/drivers/pci/probe.c ---- linux-2.6.16.orig/drivers/pci/probe.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/pci/probe.c 2006-04-19 15:02:12.000000000 +0400 -@@ -21,6 +21,7 @@ LIST_HEAD(pci_root_buses); - EXPORT_SYMBOL(pci_root_buses); - - LIST_HEAD(pci_devices); -+EXPORT_SYMBOL(pci_devices); - - #ifdef HAVE_PCI_LEGACY - /** -diff -upr linux-2.6.16.orig/drivers/pcmcia/ds.c linux-2.6.16-026test009/drivers/pcmcia/ds.c ---- linux-2.6.16.orig/drivers/pcmcia/ds.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/pcmcia/ds.c 2006-04-19 15:02:11.000000000 +0400 -@@ -546,7 +546,7 @@ static int pcmcia_device_query(struct pc - tmp = vers1->str + vers1->ofs[i]; - - length = strlen(tmp) + 1; -- if ((length < 3) || (length > 255)) -+ if ((length < 2) || (length > 255)) - continue; - - p_dev->prod_id[i] = kmalloc(sizeof(char) * length, -diff -upr linux-2.6.16.orig/drivers/s390/cio/cio.c linux-2.6.16-026test009/drivers/s390/cio/cio.c ---- linux-2.6.16.orig/drivers/s390/cio/cio.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/s390/cio/cio.c 2006-04-19 15:02:12.000000000 +0400 -@@ -610,7 +610,11 @@ do_IRQ (struct pt_regs *regs) - struct tpi_info *tpi_info; - struct subchannel *sch; - struct irb *irb; -+ struct ve_struct *ve; -+ struct user_beancounter *ub; - -+ ve = set_exec_env(get_ve0()); -+ ub = set_exec_ub(get_ub0()); - irq_enter (); - asm volatile ("mc 0,0"); - if (S390_lowcore.int_clock >= S390_lowcore.jiffy_timer) -@@ -657,6 +661,8 @@ do_IRQ (struct pt_regs *regs) 
- */ - } while (!MACHINE_IS_VM && tpi (NULL) != 0); - irq_exit (); -+ (void)set_exec_ub(ub); -+ (void)set_exec_env(ve); - } - - #ifdef CONFIG_CCW_CONSOLE -diff -upr linux-2.6.16.orig/drivers/scsi/sata_mv.c linux-2.6.16-026test009/drivers/scsi/sata_mv.c ---- linux-2.6.16.orig/drivers/scsi/sata_mv.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/scsi/sata_mv.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1102,6 +1102,7 @@ static u8 mv_get_crpb_status(struct ata_ - void __iomem *port_mmio = mv_ap_base(ap); - struct mv_port_priv *pp = ap->private_data; - u32 out_ptr; -+ u8 ata_status; - - out_ptr = readl(port_mmio + EDMA_RSP_Q_OUT_PTR_OFS); - -@@ -1109,6 +1110,8 @@ static u8 mv_get_crpb_status(struct ata_ - assert(((out_ptr >> EDMA_RSP_Q_PTR_SHIFT) & MV_MAX_Q_DEPTH_MASK) == - pp->rsp_consumer); - -+ ata_status = pp->crpb[pp->rsp_consumer].flags >> CRPB_FLAG_STATUS_SHIFT; -+ - /* increment our consumer index... */ - pp->rsp_consumer = mv_inc_q_index(&pp->rsp_consumer); - -@@ -1123,7 +1126,7 @@ static u8 mv_get_crpb_status(struct ata_ - writelfl(out_ptr, port_mmio + EDMA_RSP_Q_OUT_PTR_OFS); - - /* Return ATA status register for completed CRPB */ -- return (pp->crpb[pp->rsp_consumer].flags >> CRPB_FLAG_STATUS_SHIFT); -+ return ata_status; - } - - /** -@@ -1192,7 +1195,6 @@ static void mv_host_intr(struct ata_host - u32 hc_irq_cause; - int shift, port, port0, hard_port, handled; - unsigned int err_mask; -- u8 ata_status = 0; - - if (hc == 0) { - port0 = 0; -@@ -1210,6 +1212,7 @@ static void mv_host_intr(struct ata_host - hc,relevant,hc_irq_cause); - - for (port = port0; port < port0 + MV_PORTS_PER_HC; port++) { -+ u8 ata_status = 0; - ap = host_set->ports[port]; - hard_port = port & MV_PORT_MASK; /* range 0-3 */ - handled = 0; /* ensure ata_status is set if handled++ */ -diff -upr linux-2.6.16.orig/drivers/usb/core/message.c linux-2.6.16-026test009/drivers/usb/core/message.c ---- linux-2.6.16.orig/drivers/usb/core/message.c 2006-04-19 
15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/usb/core/message.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1388,11 +1388,13 @@ free_interfaces: - if (dev->state != USB_STATE_ADDRESS) - usb_disable_device (dev, 1); // Skip ep0 - -- i = dev->bus_mA - cp->desc.bMaxPower * 2; -- if (i < 0) -- dev_warn(&dev->dev, "new config #%d exceeds power " -- "limit by %dmA\n", -- configuration, -i); -+ if (cp) { -+ i = dev->bus_mA - cp->desc.bMaxPower * 2; -+ if (i < 0) -+ dev_warn(&dev->dev, "new config #%d exceeds power " -+ "limit by %dmA\n", -+ configuration, -i); -+ } - - if ((ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), - USB_REQ_SET_CONFIGURATION, 0, configuration, 0, -diff -upr linux-2.6.16.orig/drivers/usb/host/ehci-sched.c linux-2.6.16-026test009/drivers/usb/host/ehci-sched.c ---- linux-2.6.16.orig/drivers/usb/host/ehci-sched.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/usb/host/ehci-sched.c 2006-04-19 15:02:11.000000000 +0400 -@@ -707,6 +707,7 @@ iso_stream_init ( - } else { - u32 addr; - int think_time; -+ int hs_transfers; - - addr = dev->ttport << 24; - if (!ehci_is_TDI(ehci) -@@ -719,6 +720,7 @@ iso_stream_init ( - think_time = dev->tt ? 
dev->tt->think_time : 0; - stream->tt_usecs = NS_TO_US (think_time + usb_calc_bus_time ( - dev->speed, is_input, 1, maxp)); -+ hs_transfers = max (1u, (maxp + 187) / 188); - if (is_input) { - u32 tmp; - -@@ -727,12 +729,11 @@ iso_stream_init ( - stream->usecs = HS_USECS_ISO (1); - stream->raw_mask = 1; - -- /* pessimistic c-mask */ -- tmp = usb_calc_bus_time (USB_SPEED_FULL, 1, 0, maxp) -- / (125 * 1000); -- stream->raw_mask |= 3 << (tmp + 9); -+ /* c-mask as specified in USB 2.0 11.18.4 3.c */ -+ tmp = (1 << (hs_transfers + 2)) - 1; -+ stream->raw_mask |= tmp << (8 + 2); - } else -- stream->raw_mask = smask_out [maxp / 188]; -+ stream->raw_mask = smask_out [hs_transfers - 1]; - bandwidth = stream->usecs + stream->c_usecs; - bandwidth /= 1 << (interval + 2); - -diff -upr linux-2.6.16.orig/drivers/usb/serial/console.c linux-2.6.16-026test009/drivers/usb/serial/console.c ---- linux-2.6.16.orig/drivers/usb/serial/console.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/usb/serial/console.c 2006-04-19 15:02:11.000000000 +0400 -@@ -54,7 +54,7 @@ static struct console usbcons; - * serial.c code, except that the specifier is "ttyUSB" instead - * of "ttyS". 
- */ --static int __init usb_console_setup(struct console *co, char *options) -+static int usb_console_setup(struct console *co, char *options) - { - struct usbcons_info *info = &usbcons_info; - int baud = 9600; -diff -upr linux-2.6.16.orig/drivers/usb/storage/Kconfig linux-2.6.16-026test009/drivers/usb/storage/Kconfig ---- linux-2.6.16.orig/drivers/usb/storage/Kconfig 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/usb/storage/Kconfig 2006-04-19 15:02:11.000000000 +0400 -@@ -48,7 +48,8 @@ config USB_STORAGE_FREECOM - - config USB_STORAGE_ISD200 - bool "ISD-200 USB/ATA Bridge support" -- depends on USB_STORAGE && BLK_DEV_IDE -+ depends on USB_STORAGE -+ depends on BLK_DEV_IDE=y || BLK_DEV_IDE=USB_STORAGE - ---help--- - Say Y here if you want to use USB Mass Store devices based - on the In-Systems Design ISD-200 USB/ATA bridge. -diff -upr linux-2.6.16.orig/drivers/video/cfbimgblt.c linux-2.6.16-026test009/drivers/video/cfbimgblt.c ---- linux-2.6.16.orig/drivers/video/cfbimgblt.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/video/cfbimgblt.c 2006-04-19 15:02:11.000000000 +0400 -@@ -169,7 +169,7 @@ static inline void slow_imageblit(const - - while (j--) { - l--; -- color = (*s & 1 << (FB_BIT_NR(l))) ? fgcolor : bgcolor; -+ color = (*s & (1 << l)) ? fgcolor : bgcolor; - val |= FB_SHIFT_HIGH(color, shift); - - /* Did the bitshift spill bits to the next long? 
*/ -diff -upr linux-2.6.16.orig/drivers/video/i810/i810_main.c linux-2.6.16-026test009/drivers/video/i810/i810_main.c ---- linux-2.6.16.orig/drivers/video/i810/i810_main.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/drivers/video/i810/i810_main.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1508,7 +1508,7 @@ static int i810fb_cursor(struct fb_info - int size = ((cursor->image.width + 7) >> 3) * - cursor->image.height; - int i; -- u8 *data = kmalloc(64 * 8, GFP_KERNEL); -+ u8 *data = kmalloc(64 * 8, GFP_ATOMIC); - - if (data == NULL) - return -ENOMEM; -diff -upr linux-2.6.16.orig/fs/9p/vfs_inode.c linux-2.6.16-026test009/fs/9p/vfs_inode.c ---- linux-2.6.16.orig/fs/9p/vfs_inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/9p/vfs_inode.c 2006-04-19 15:02:11.000000000 +0400 -@@ -614,6 +614,7 @@ static struct dentry *v9fs_vfs_lookup(st - - sb = dir->i_sb; - v9ses = v9fs_inode2v9ses(dir); -+ dentry->d_op = &v9fs_dentry_operations; - dirfid = v9fs_fid_lookup(dentry->d_parent); - - if (!dirfid) { -@@ -681,8 +682,6 @@ static struct dentry *v9fs_vfs_lookup(st - goto FreeFcall; - - fid->qid = fcall->params.rstat.stat.qid; -- -- dentry->d_op = &v9fs_dentry_operations; - v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); - - d_add(dentry, inode); -diff -upr linux-2.6.16.orig/fs/Kconfig linux-2.6.16-026test009/fs/Kconfig ---- linux-2.6.16.orig/fs/Kconfig 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/Kconfig 2006-04-19 15:02:12.000000000 +0400 -@@ -433,6 +433,38 @@ config QFMT_V2 - This quota format allows using quotas with 32-bit UIDs/GIDs. If you - need this functionality say Y here. - -+config SIM_FS -+ tristate "VPS filesystem" -+ depends on VZ_QUOTA -+ default m -+ help -+ This file system is a part of Virtuozzo. It intoduces a fake -+ superblock and blockdev to VE to hide real device and show -+ statfs results taken from quota. 
-+ -+config VZ_QUOTA -+ tristate "Virtuozzo Disk Quota support" -+ depends on QUOTA -+ default m -+ help -+ Virtuozzo Disk Quota imposes disk quota on directories with their -+ files and subdirectories in total. Such disk quota is used to -+ account and limit disk usage by Virtuozzo VPS, but also may be used -+ separately. -+ -+config VZ_QUOTA_UNLOAD -+ bool "Unloadable Virtuozzo Disk Quota module" -+ depends on VZ_QUOTA=m -+ default n -+ help -+ Make Virtuozzo Disk Quota module unloadable. -+ Doesn't work reliably now. -+ -+config VZ_QUOTA_UGID -+ bool "Per-user and per-group quota in Virtuozzo quota partitions" -+ depends on VZ_QUOTA!=n -+ default y -+ - config QUOTACTL - bool - depends on XFS_QUOTA || QUOTA -diff -upr linux-2.6.16.orig/fs/Makefile linux-2.6.16-026test009/fs/Makefile ---- linux-2.6.16.orig/fs/Makefile 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/Makefile 2006-04-19 15:02:12.000000000 +0400 -@@ -39,9 +39,15 @@ obj-$(CONFIG_QUOTA) += dquot.o - obj-$(CONFIG_QFMT_V1) += quota_v1.o - obj-$(CONFIG_QFMT_V2) += quota_v2.o - obj-$(CONFIG_QUOTACTL) += quota.o -+obj-$(CONFIG_VZ_QUOTA) += vzdquota.o -+vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o -+vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o -+vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o - - obj-$(CONFIG_DNOTIFY) += dnotify.o - -+obj-$(CONFIG_SIM_FS) += simfs.o -+ - obj-$(CONFIG_PROC_FS) += proc/ - obj-y += partitions/ - obj-$(CONFIG_SYSFS) += sysfs/ -diff -upr linux-2.6.16.orig/fs/aio.c linux-2.6.16-026test009/fs/aio.c ---- linux-2.6.16.orig/fs/aio.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/aio.c 2006-04-19 15:02:12.000000000 +0400 -@@ -41,13 +41,16 @@ - #endif - - /*------ sysctl variables----*/ --static DEFINE_SPINLOCK(aio_nr_lock); -+DEFINE_SPINLOCK(aio_nr_lock); - unsigned long aio_nr; /* current system wide number of aio requests */ - unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ 
-+EXPORT_SYMBOL_GPL(aio_nr_lock); -+EXPORT_SYMBOL_GPL(aio_nr); - /*----end sysctl variables---*/ - - static kmem_cache_t *kiocb_cachep; --static kmem_cache_t *kioctx_cachep; -+kmem_cache_t *kioctx_cachep; -+EXPORT_SYMBOL_GPL(kioctx_cachep); - - static struct workqueue_struct *aio_wq; - -@@ -58,7 +61,7 @@ static DECLARE_WORK(fput_work, aio_fput_ - static DEFINE_SPINLOCK(fput_lock); - static LIST_HEAD(fput_head); - --static void aio_kick_handler(void *); -+void aio_kick_handler(void *); - static void aio_queue_work(struct kioctx *); - - /* aio_setup -@@ -293,7 +296,7 @@ static void aio_cancel_all(struct kioctx - spin_unlock_irq(&ctx->ctx_lock); - } - --static void wait_for_all_aios(struct kioctx *ctx) -+void wait_for_all_aios(struct kioctx *ctx) - { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); -@@ -310,6 +313,7 @@ static void wait_for_all_aios(struct kio - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); - } -+EXPORT_SYMBOL_GPL(wait_for_all_aios); - - /* wait_on_sync_kiocb: - * Waits on the given sync kiocb to complete. -@@ -856,7 +860,7 @@ static inline void aio_run_all_iocbs(str - * space. - * Run on aiod's context. - */ --static void aio_kick_handler(void *data) -+void aio_kick_handler(void *data) - { - struct kioctx *ctx = data; - mm_segment_t oldfs = get_fs(); -@@ -875,6 +879,7 @@ static void aio_kick_handler(void *data) - if (requeue) - queue_work(aio_wq, &ctx->wq); - } -+EXPORT_SYMBOL_GPL(aio_kick_handler); - - - /* -diff -upr linux-2.6.16.orig/fs/autofs/autofs_i.h linux-2.6.16-026test009/fs/autofs/autofs_i.h ---- linux-2.6.16.orig/fs/autofs/autofs_i.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/autofs/autofs_i.h 2006-04-19 15:02:12.000000000 +0400 -@@ -124,7 +124,7 @@ static inline struct autofs_sb_info *aut - filesystem without "magic".) 
*/ - - static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { -- return sbi->catatonic || process_group(current) == sbi->oz_pgrp; -+ return sbi->catatonic || virt_pgid(current) == sbi->oz_pgrp; - } - - /* Hash operations */ -diff -upr linux-2.6.16.orig/fs/autofs/init.c linux-2.6.16-026test009/fs/autofs/init.c ---- linux-2.6.16.orig/fs/autofs/init.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/autofs/init.c 2006-04-19 15:02:12.000000000 +0400 -@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs - .name = "autofs", - .get_sb = autofs_get_sb, - .kill_sb = kill_anon_super, -+ .fs_flags = FS_VIRTUALIZED, - }; - - static int __init init_autofs_fs(void) -diff -upr linux-2.6.16.orig/fs/autofs/inode.c linux-2.6.16-026test009/fs/autofs/inode.c ---- linux-2.6.16.orig/fs/autofs/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/autofs/inode.c 2006-04-19 15:02:12.000000000 +0400 -@@ -66,7 +66,7 @@ static int parse_options(char *options, - - *uid = current->uid; - *gid = current->gid; -- *pgrp = process_group(current); -+ *pgrp = virt_pgid(current); - - *minproto = *maxproto = AUTOFS_PROTO_VERSION; - -@@ -138,7 +138,7 @@ int autofs_fill_super(struct super_block - sbi->magic = AUTOFS_SBI_MAGIC; - sbi->catatonic = 0; - sbi->exp_timeout = 0; -- sbi->oz_pgrp = process_group(current); -+ sbi->oz_pgrp = virt_pgid(current); - autofs_initialize_hash(&sbi->dirhash); - sbi->queues = NULL; - memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN); -diff -upr linux-2.6.16.orig/fs/autofs/root.c linux-2.6.16-026test009/fs/autofs/root.c ---- linux-2.6.16.orig/fs/autofs/root.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/autofs/root.c 2006-04-19 15:02:12.000000000 +0400 -@@ -354,7 +354,7 @@ static int autofs_root_unlink(struct ino - - /* This allows root to remove symlinks */ - lock_kernel(); -- if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) { -+ if ( !autofs_oz_mode(sbi) && 
!capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) { - unlock_kernel(); - return -EACCES; - } -@@ -541,7 +541,7 @@ static int autofs_root_ioctl(struct inod - _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) - return -ENOTTY; - -- if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) -+ if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) - return -EPERM; - - switch(cmd) { -diff -upr linux-2.6.16.orig/fs/autofs4/autofs_i.h linux-2.6.16-026test009/fs/autofs4/autofs_i.h ---- linux-2.6.16.orig/fs/autofs4/autofs_i.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/autofs4/autofs_i.h 2006-04-19 15:02:12.000000000 +0400 -@@ -122,7 +122,7 @@ static inline struct autofs_info *autofs - filesystem without "magic".) */ - - static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { -- return sbi->catatonic || process_group(current) == sbi->oz_pgrp; -+ return sbi->catatonic || virt_pgid(current) == sbi->oz_pgrp; - } - - /* Does a dentry have some pending activity? 
*/ -diff -upr linux-2.6.16.orig/fs/autofs4/init.c linux-2.6.16-026test009/fs/autofs4/init.c ---- linux-2.6.16.orig/fs/autofs4/init.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/autofs4/init.c 2006-04-19 15:02:12.000000000 +0400 -@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs - .name = "autofs", - .get_sb = autofs_get_sb, - .kill_sb = kill_anon_super, -+ .fs_flags = FS_VIRTUALIZED, - }; - - static int __init init_autofs4_fs(void) -diff -upr linux-2.6.16.orig/fs/autofs4/inode.c linux-2.6.16-026test009/fs/autofs4/inode.c ---- linux-2.6.16.orig/fs/autofs4/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/autofs4/inode.c 2006-04-19 15:02:12.000000000 +0400 -@@ -179,7 +179,7 @@ static int parse_options(char *options, - - *uid = current->uid; - *gid = current->gid; -- *pgrp = process_group(current); -+ *pgrp = virt_pgid(current); - - *minproto = AUTOFS_MIN_PROTO_VERSION; - *maxproto = AUTOFS_MAX_PROTO_VERSION; -@@ -265,7 +265,7 @@ int autofs4_fill_super(struct super_bloc - sbi->root = NULL; - sbi->catatonic = 0; - sbi->exp_timeout = 0; -- sbi->oz_pgrp = process_group(current); -+ sbi->oz_pgrp = virt_pgid(current); - sbi->sb = s; - sbi->version = 0; - sbi->sub_version = 0; -diff -upr linux-2.6.16.orig/fs/autofs4/root.c linux-2.6.16-026test009/fs/autofs4/root.c ---- linux-2.6.16.orig/fs/autofs4/root.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/autofs4/root.c 2006-04-19 15:02:12.000000000 +0400 -@@ -592,7 +592,7 @@ static int autofs4_dir_unlink(struct ino - struct autofs_info *ino = autofs4_dentry_ino(dentry); - - /* This allows root to remove symlinks */ -- if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) -+ if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) - return -EACCES; - - dput(ino->dentry); -@@ -784,7 +784,7 @@ static int autofs4_root_ioctl(struct ino - _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) - return -ENOTTY; - -- 
if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) -+ if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) - return -EPERM; - - switch(cmd) { -diff -upr linux-2.6.16.orig/fs/binfmt_aout.c linux-2.6.16-026test009/fs/binfmt_aout.c ---- linux-2.6.16.orig/fs/binfmt_aout.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/binfmt_aout.c 2006-04-19 15:02:12.000000000 +0400 -@@ -446,9 +446,11 @@ beyond_if: - #endif - start_thread(regs, ex.a_entry, current->mm->start_stack); - if (unlikely(current->ptrace & PT_PTRACED)) { -- if (current->ptrace & PT_TRACE_EXEC) -+ if (current->ptrace & PT_TRACE_EXEC) { -+ set_pn_state(current, PN_STOP_EXEC); - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); -- else -+ clear_pn_state(current); -+ } else - send_sig(SIGTRAP, current, 0); - } - return 0; -diff -upr linux-2.6.16.orig/fs/binfmt_elf.c linux-2.6.16-026test009/fs/binfmt_elf.c ---- linux-2.6.16.orig/fs/binfmt_elf.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/binfmt_elf.c 2006-04-19 15:02:12.000000000 +0400 -@@ -361,7 +361,7 @@ static unsigned long load_elf_interp(str - eppnt = elf_phdata; - for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) { - if (eppnt->p_type == PT_LOAD) { -- int elf_type = MAP_PRIVATE | MAP_DENYWRITE; -+ int elf_type = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECPRIO; - int elf_prot = 0; - unsigned long vaddr = 0; - unsigned long k, map_addr; -@@ -669,7 +669,7 @@ static int load_elf_binary(struct linux_ - */ - SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); - -- interpreter = open_exec(elf_interpreter); -+ interpreter = open_exec(elf_interpreter, NULL); - retval = PTR_ERR(interpreter); - if (IS_ERR(interpreter)) - goto out_free_interp; -@@ -834,7 +834,7 @@ static int load_elf_binary(struct linux_ - if (elf_ppnt->p_flags & PF_W) elf_prot |= PROT_WRITE; - if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; - -- elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE; -+ elf_flags = 
MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE|MAP_EXECPRIO; - - vaddr = elf_ppnt->p_vaddr; - if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { -@@ -1000,9 +1000,11 @@ static int load_elf_binary(struct linux_ - - start_thread(regs, elf_entry, bprm->p); - if (unlikely(current->ptrace & PT_PTRACED)) { -- if (current->ptrace & PT_TRACE_EXEC) -+ if (current->ptrace & PT_TRACE_EXEC) { -+ set_pn_state(current, PN_STOP_EXEC); - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); -- else -+ clear_pn_state(current); -+ } else - send_sig(SIGTRAP, current, 0); - } - retval = 0; -@@ -1022,8 +1024,13 @@ out_free_file: - sys_close(elf_exec_fileno); - out_free_fh: - if (files) { -- put_files_struct(current->files); -+ struct files_struct *old; -+ -+ old = current->files; -+ task_lock(current); - current->files = files; -+ task_unlock(current); -+ put_files_struct(old); - } - out_free_ph: - kfree(elf_phdata); -@@ -1281,10 +1288,10 @@ static void fill_prstatus(struct elf_prs - prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; - prstatus->pr_sigpend = p->pending.signal.sig[0]; - prstatus->pr_sighold = p->blocked.sig[0]; -- prstatus->pr_pid = p->pid; -- prstatus->pr_ppid = p->parent->pid; -- prstatus->pr_pgrp = process_group(p); -- prstatus->pr_sid = p->signal->session; -+ prstatus->pr_pid = virt_pid(p); -+ prstatus->pr_ppid = virt_pid(p->parent); -+ prstatus->pr_pgrp = virt_pgid(p); -+ prstatus->pr_sid = virt_sid(p); - if (thread_group_leader(p)) { - /* - * This is the record for the group leader. Add in the -@@ -1327,10 +1334,10 @@ static int fill_psinfo(struct elf_prpsin - psinfo->pr_psargs[i] = ' '; - psinfo->pr_psargs[len] = 0; - -- psinfo->pr_pid = p->pid; -- psinfo->pr_ppid = p->parent->pid; -- psinfo->pr_pgrp = process_group(p); -- psinfo->pr_sid = p->signal->session; -+ psinfo->pr_pid = virt_pid(p); -+ psinfo->pr_ppid = virt_pid(p->parent); -+ psinfo->pr_pgrp = virt_pgid(p); -+ psinfo->pr_sid = virt_sid(p); - - i = p->state ? 
ffz(~p->state) + 1 : 0; - psinfo->pr_state = i; -@@ -1463,7 +1470,7 @@ static int elf_core_dump(long signr, str - if (signr) { - struct elf_thread_status *tmp; - read_lock(&tasklist_lock); -- do_each_thread(g,p) -+ do_each_thread_ve(g,p) - if (current->mm == p->mm && current != p) { - tmp = kmalloc(sizeof(*tmp), GFP_ATOMIC); - if (!tmp) { -@@ -1475,7 +1482,7 @@ static int elf_core_dump(long signr, str - tmp->thread = p; - list_add(&tmp->list, &thread_list); - } -- while_each_thread(g,p); -+ while_each_thread_ve(g,p); - read_unlock(&tasklist_lock); - list_for_each(t, &thread_list) { - struct elf_thread_status *tmp; -diff -upr linux-2.6.16.orig/fs/binfmt_elf_fdpic.c linux-2.6.16-026test009/fs/binfmt_elf_fdpic.c ---- linux-2.6.16.orig/fs/binfmt_elf_fdpic.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/binfmt_elf_fdpic.c 2006-04-19 15:02:11.000000000 +0400 -@@ -205,7 +205,7 @@ static int load_elf_fdpic_binary(struct - kdebug("Using ELF interpreter %s", interpreter_name); - - /* replace the program with the interpreter */ -- interpreter = open_exec(interpreter_name); -+ interpreter = open_exec(interpreter_name, bprm); - retval = PTR_ERR(interpreter); - if (IS_ERR(interpreter)) { - interpreter = NULL; -diff -upr linux-2.6.16.orig/fs/binfmt_em86.c linux-2.6.16-026test009/fs/binfmt_em86.c ---- linux-2.6.16.orig/fs/binfmt_em86.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/binfmt_em86.c 2006-04-19 15:02:11.000000000 +0400 -@@ -82,7 +82,7 @@ static int load_em86(struct linux_binprm - * Note that we use open_exec() as the name is now in kernel - * space, and we don't need to copy it. 
- */ -- file = open_exec(interp); -+ file = open_exec(interp, bprm); - if (IS_ERR(file)) - return PTR_ERR(file); - -diff -upr linux-2.6.16.orig/fs/binfmt_flat.c linux-2.6.16-026test009/fs/binfmt_flat.c ---- linux-2.6.16.orig/fs/binfmt_flat.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/binfmt_flat.c 2006-04-19 15:02:11.000000000 +0400 -@@ -774,7 +774,7 @@ static int load_flat_shared_library(int - - /* Open the file up */ - bprm.filename = buf; -- bprm.file = open_exec(bprm.filename); -+ bprm.file = open_exec(bprm.filename, bprm); - res = PTR_ERR(bprm.file); - if (IS_ERR(bprm.file)) - return res; -diff -upr linux-2.6.16.orig/fs/binfmt_misc.c linux-2.6.16-026test009/fs/binfmt_misc.c ---- linux-2.6.16.orig/fs/binfmt_misc.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/binfmt_misc.c 2006-04-19 15:02:11.000000000 +0400 -@@ -179,7 +179,7 @@ static int load_misc_binary(struct linux - - bprm->interp = iname; /* for binfmt_script */ - -- interp_file = open_exec (iname); -+ interp_file = open_exec (iname, bprm); - retval = PTR_ERR (interp_file); - if (IS_ERR (interp_file)) - goto _error; -@@ -216,8 +216,13 @@ _error: - bprm->interp_data = 0; - _unshare: - if (files) { -- put_files_struct(current->files); -+ struct files_struct *old; -+ -+ old = current->files; -+ task_lock(current); - current->files = files; -+ task_unlock(current); -+ put_files_struct(old); - } - goto _ret; - } -diff -upr linux-2.6.16.orig/fs/binfmt_script.c linux-2.6.16-026test009/fs/binfmt_script.c ---- linux-2.6.16.orig/fs/binfmt_script.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/binfmt_script.c 2006-04-19 15:02:11.000000000 +0400 -@@ -85,7 +85,7 @@ static int load_script(struct linux_binp - /* - * OK, now restart the process with the interpreter's dentry. 
- */ -- file = open_exec(interp); -+ file = open_exec(interp, bprm); - if (IS_ERR(file)) - return PTR_ERR(file); - -diff -upr linux-2.6.16.orig/fs/block_dev.c linux-2.6.16-026test009/fs/block_dev.c ---- linux-2.6.16.orig/fs/block_dev.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/block_dev.c 2006-04-19 15:02:12.000000000 +0400 -@@ -561,9 +561,16 @@ static int do_open(struct block_device * - { - struct module *owner = NULL; - struct gendisk *disk; -- int ret = -ENXIO; -+ int ret; - int part; - -+#ifdef CONFIG_VE -+ ret = get_device_perms_ve(S_IFBLK, bdev->bd_dev, -+ file->f_mode&(FMODE_READ|FMODE_WRITE)); -+ if (ret) -+ return ret; -+#endif -+ ret = -ENXIO; - file->f_mapping = bdev->bd_inode->i_mapping; - lock_kernel(); - disk = get_gendisk(bdev->bd_dev, &part); -@@ -832,7 +839,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); - * namespace if possible and return it. Return ERR_PTR(error) - * otherwise. - */ --struct block_device *lookup_bdev(const char *path) -+struct block_device *lookup_bdev(const char *path, int mode) - { - struct block_device *bdev; - struct inode *inode; -@@ -850,6 +857,11 @@ struct block_device *lookup_bdev(const c - error = -ENOTBLK; - if (!S_ISBLK(inode->i_mode)) - goto fail; -+#ifdef CONFIG_VE -+ error = get_device_perms_ve(S_IFBLK, inode->i_rdev, mode); -+ if (error) -+ goto fail; -+#endif - error = -EACCES; - if (nd.mnt->mnt_flags & MNT_NODEV) - goto fail; -@@ -881,12 +893,13 @@ struct block_device *open_bdev_excl(cons - mode_t mode = FMODE_READ; - int error = 0; - -- bdev = lookup_bdev(path); -+ if (!(flags & MS_RDONLY)) -+ mode |= FMODE_WRITE; -+ -+ bdev = lookup_bdev(path, mode); - if (IS_ERR(bdev)) - return bdev; - -- if (!(flags & MS_RDONLY)) -- mode |= FMODE_WRITE; - error = blkdev_get(bdev, mode, 0); - if (error) - return ERR_PTR(error); -diff -upr linux-2.6.16.orig/fs/char_dev.c linux-2.6.16-026test009/fs/char_dev.c ---- linux-2.6.16.orig/fs/char_dev.c 2006-04-19 15:02:01.000000000 +0400 -+++ 
linux-2.6.16-026test009/fs/char_dev.c 2006-04-19 15:02:12.000000000 +0400 -@@ -342,6 +342,13 @@ int chrdev_open(struct inode * inode, st - struct cdev *new = NULL; - int ret = 0; - -+#ifdef CONFIG_VE -+ ret = get_device_perms_ve(S_IFCHR, inode->i_rdev, -+ filp->f_mode&(FMODE_READ|FMODE_WRITE)); -+ if (ret) -+ return ret; -+#endif -+ - spin_lock(&cdev_lock); - p = inode->i_cdev; - if (!p) { -diff -upr linux-2.6.16.orig/fs/cifs/cifsencrypt.c linux-2.6.16-026test009/fs/cifs/cifsencrypt.c ---- linux-2.6.16.orig/fs/cifs/cifsencrypt.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/cifs/cifsencrypt.c 2006-04-19 15:02:11.000000000 +0400 -@@ -56,9 +56,6 @@ int cifs_sign_smb(struct smb_hdr * cifs_ - int rc = 0; - char smb_signature[20]; - -- /* BB remember to initialize sequence number elsewhere and initialize mac_signing key elsewhere BB */ -- /* BB remember to add code to save expected sequence number in midQ entry BB */ -- - if((cifs_pdu == NULL) || (server == NULL)) - return -EINVAL; - -@@ -85,20 +82,33 @@ int cifs_sign_smb(struct smb_hdr * cifs_ - static int cifs_calc_signature2(const struct kvec * iov, int n_vec, - const char * key, char * signature) - { -- struct MD5Context context; -- -- if((iov == NULL) || (signature == NULL)) -- return -EINVAL; -+ struct MD5Context context; -+ int i; - -- MD5Init(&context); -- MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16); -+ if((iov == NULL) || (signature == NULL)) -+ return -EINVAL; - --/* MD5Update(&context,cifs_pdu->Protocol,cifs_pdu->smb_buf_length); */ /* BB FIXME BB */ -+ MD5Init(&context); -+ MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16); -+ for(i=0;i<n_vec;i++) { -+ if(iov[i].iov_base == NULL) { -+ cERROR(1,("null iovec entry")); -+ return -EIO; -+ } else if(iov[i].iov_len == 0) -+ break; /* bail out if we are sent nothing to sign */ -+ /* The first entry includes a length field (which does not get -+ signed that occupies the first 4 bytes before the header */ -+ if(i==0) { -+ if (iov[0].iov_len 
<= 8 ) /* cmd field at offset 9 */ -+ break; /* nothing to sign or corrupt header */ -+ MD5Update(&context,iov[0].iov_base+4, iov[0].iov_len-4); -+ } else -+ MD5Update(&context,iov[i].iov_base, iov[i].iov_len); -+ } - -- MD5Final(signature,&context); -+ MD5Final(signature,&context); - -- return -EOPNOTSUPP; --/* return 0; */ -+ return 0; - } - - -diff -upr linux-2.6.16.orig/fs/cifs/cifsfs.c linux-2.6.16-026test009/fs/cifs/cifsfs.c ---- linux-2.6.16.orig/fs/cifs/cifsfs.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/cifs/cifsfs.c 2006-04-19 15:02:11.000000000 +0400 -@@ -220,7 +220,8 @@ cifs_statfs(struct super_block *sb, stru - longer available? */ - } - --static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd) -+static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - struct cifs_sb_info *cifs_sb; - -@@ -232,7 +233,7 @@ static int cifs_permission(struct inode - on the client (above and beyond ACL on servers) for - servers which do not support setting and viewing mode bits, - so allowing client to check permissions is useful */ -- return generic_permission(inode, mask, NULL); -+ return generic_permission(inode, mask, NULL, perm); - } - - static kmem_cache_t *cifs_inode_cachep; -diff -upr linux-2.6.16.orig/fs/coda/dir.c linux-2.6.16-026test009/fs/coda/dir.c ---- linux-2.6.16.orig/fs/coda/dir.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/coda/dir.c 2006-04-19 15:02:11.000000000 +0400 -@@ -151,7 +151,8 @@ exit: - } - - --int coda_permission(struct inode *inode, int mask, struct nameidata *nd) -+int coda_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - int error = 0; - -diff -upr linux-2.6.16.orig/fs/coda/pioctl.c linux-2.6.16-026test009/fs/coda/pioctl.c ---- linux-2.6.16.orig/fs/coda/pioctl.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/coda/pioctl.c 2006-04-19 
15:02:11.000000000 +0400 -@@ -25,7 +25,7 @@ - - /* pioctl ops */ - static int coda_ioctl_permission(struct inode *inode, int mask, -- struct nameidata *nd); -+ struct nameidata *nd, struct exec_perm *perm); - static int coda_pioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long user_data); - -@@ -43,7 +43,7 @@ struct file_operations coda_ioctl_operat - - /* the coda pioctl inode ops */ - static int coda_ioctl_permission(struct inode *inode, int mask, -- struct nameidata *nd) -+ struct nameidata *nd, struct exec_perm *perm) - { - return 0; - } -diff -upr linux-2.6.16.orig/fs/compat.c linux-2.6.16-026test009/fs/compat.c ---- linux-2.6.16.orig/fs/compat.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/compat.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1479,7 +1479,7 @@ int compat_do_execve(char * filename, - goto out_ret; - memset(bprm, 0, sizeof(*bprm)); - -- file = open_exec(filename); -+ file = open_exec(filename, bprm); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_kfree; -diff -upr linux-2.6.16.orig/fs/dcache.c linux-2.6.16-026test009/fs/dcache.c ---- linux-2.6.16.orig/fs/dcache.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/dcache.c 2006-04-19 15:02:12.000000000 +0400 -@@ -28,11 +28,16 @@ - #include <linux/module.h> - #include <linux/mount.h> - #include <linux/file.h> -+#include <linux/namei.h> - #include <asm/uaccess.h> - #include <linux/security.h> - #include <linux/seqlock.h> - #include <linux/swap.h> - #include <linux/bootmem.h> -+#include <linux/kernel_stat.h> -+#include <net/inet_sock.h> -+ -+#include <ub/ub_dcache.h> - - /* #define DCACHE_DEBUG 1 */ - -@@ -44,7 +49,7 @@ static seqlock_t rename_lock __cacheline - - EXPORT_SYMBOL(dcache_lock); - --static kmem_cache_t *dentry_cache; -+kmem_cache_t *dentry_cache; - - #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) - -@@ -114,6 +119,75 @@ static void dentry_iput(struct dentry * - } - } - -+struct 
dcache_shrinker { -+ struct list_head list; -+ struct dentry *dentry; -+}; -+ -+DECLARE_WAIT_QUEUE_HEAD(dcache_shrinker_wq); -+ -+/* called under dcache_lock */ -+static void dcache_shrinker_add(struct dcache_shrinker *ds, -+ struct dentry *parent, struct dentry *dentry) -+{ -+ struct super_block *sb; -+ -+ sb = parent->d_sb; -+ ds->dentry = parent; -+ list_add(&ds->list, &sb->s_dshrinkers); -+} -+ -+/* called under dcache_lock */ -+static void dcache_shrinker_del(struct dcache_shrinker *ds) -+{ -+ if (ds == NULL || list_empty(&ds->list)) -+ return; -+ -+ list_del_init(&ds->list); -+ wake_up_all(&dcache_shrinker_wq); -+} -+ -+/* called under dcache_lock, drops inside */ -+static void dcache_shrinker_wait(struct super_block *sb) -+{ -+ DECLARE_WAITQUEUE(wq, current); -+ -+ __set_current_state(TASK_UNINTERRUPTIBLE); -+ add_wait_queue(&dcache_shrinker_wq, &wq); -+ spin_unlock(&dcache_lock); -+ -+ schedule(); -+ remove_wait_queue(&dcache_shrinker_wq, &wq); -+ __set_current_state(TASK_RUNNING); -+} -+ -+void dcache_shrinker_wait_sb(struct super_block *sb) -+{ -+ /* the root dentry can be held in dput_recursive */ -+ spin_lock(&dcache_lock); -+ while (!list_empty(&sb->s_dshrinkers)) { -+ dcache_shrinker_wait(sb); -+ spin_lock(&dcache_lock); -+ } -+ spin_unlock(&dcache_lock); -+} -+ -+/* dcache_lock protects shrinker's list */ -+static void shrink_dcache_racecheck(struct dentry *parent, int *racecheck) -+{ -+ struct super_block *sb; -+ struct dcache_shrinker *ds; -+ -+ sb = parent->d_sb; -+ list_for_each_entry(ds, &sb->s_dshrinkers, list) { -+ /* is one of dcache shrinkers working on the dentry? 
*/ -+ if (ds->dentry == parent) { -+ *racecheck = 1; -+ break; -+ } -+ } -+} -+ - /* - * This is dput - * -@@ -132,8 +206,9 @@ static void dentry_iput(struct dentry * - */ - - /* -- * dput - release a dentry -- * @dentry: dentry to release -+ * dput_recursive - go upward through the dentry tree and release dentries -+ * @dentry: starting dentry -+ * @ds: shrinker to be added to active list (see shrink_dcache_parent) - * - * Release a dentry. This will drop the usage count and if appropriate - * call the dentry unlink method as well as removing it from the queues and -@@ -142,18 +217,15 @@ static void dentry_iput(struct dentry * - * - * no dcache lock, please. - */ -- --void dput(struct dentry *dentry) -+static void dput_recursive(struct dentry *dentry, struct dcache_shrinker *ds) - { -- if (!dentry) -- return; -- --repeat: - if (atomic_read(&dentry->d_count) == 1) - might_sleep(); - if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) - return; -+ dcache_shrinker_del(ds); - -+repeat: - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); -@@ -185,6 +257,7 @@ unhash_it: - - kill_it: { - struct dentry *parent; -+ struct dcache_shrinker lds; - - /* If dentry was on d_lru list - * delete it from there -@@ -194,18 +267,50 @@ kill_it: { - dentry_stat.nr_unused--; - } - list_del(&dentry->d_u.d_child); -+ parent = dentry->d_parent; -+ dcache_shrinker_add(&lds, parent, dentry); - dentry_stat.nr_dentry--; /* For d_free, below */ - /*drops the locks, at that point nobody can reach this dentry */ - dentry_iput(dentry); -- parent = dentry->d_parent; - d_free(dentry); -- if (dentry == parent) -+ if (unlikely(dentry == parent)) { -+ spin_lock(&dcache_lock); -+ dcache_shrinker_del(&lds); -+ spin_unlock(&dcache_lock); - return; -+ } - dentry = parent; -- goto repeat; -+ spin_lock(&dcache_lock); -+ dcache_shrinker_del(&lds); -+ if (atomic_dec_and_test(&dentry->d_count)) -+ goto repeat; -+ spin_unlock(&dcache_lock); - } - } - -+/* -+ 
* dput - release a dentry -+ * @dentry: dentry to release -+ * -+ * Release a dentry. This will drop the usage count and if appropriate -+ * call the dentry unlink method as well as removing it from the queues and -+ * releasing its resources. If the parent dentries were scheduled for release -+ * they too may now get deleted. -+ * -+ * no dcache lock, please. -+ */ -+ -+void dput(struct dentry *dentry) -+{ -+ if (!dentry) -+ return; -+ -+ spin_lock(&dcache_lock); -+ ub_dentry_uncharge(dentry); -+ spin_unlock(&dcache_lock); -+ dput_recursive(dentry, NULL); -+} -+ - /** - * d_invalidate - invalidate a dentry - * @dentry: dentry to invalidate -@@ -272,6 +377,8 @@ static inline struct dentry * __dget_loc - dentry_stat.nr_unused--; - list_del_init(&dentry->d_lru); - } -+ -+ ub_dentry_charge_nofail(dentry); - return dentry; - } - -@@ -362,19 +469,27 @@ restart: - * removed. - * Called with dcache_lock, drops it and then regains. - */ --static inline void prune_one_dentry(struct dentry * dentry) -+static void prune_one_dentry(struct dentry * dentry) - { - struct dentry * parent; -+ struct dcache_shrinker ds; - - __d_drop(dentry); - list_del(&dentry->d_u.d_child); -+ parent = dentry->d_parent; -+ dcache_shrinker_add(&ds, parent, dentry); - dentry_stat.nr_dentry--; /* For d_free, below */ - dentry_iput(dentry); - parent = dentry->d_parent; - d_free(dentry); - if (parent != dentry) -- dput(parent); -+ /* -+ * dentry is not in use, only child (not outside) -+ * references change, so parent->d_inuse does not change -+ */ -+ dput_recursive(parent, &ds); - spin_lock(&dcache_lock); -+ dcache_shrinker_del(&ds); - } - - /** -@@ -486,6 +601,7 @@ repeat: - continue; - } - prune_one_dentry(dentry); -+ cond_resched_lock(&dcache_lock); - goto repeat; - } - spin_unlock(&dcache_lock); -@@ -557,13 +673,12 @@ positive: - * drop the lock and return early due to latency - * constraints. 
- */ --static int select_parent(struct dentry * parent) -+static int select_parent(struct dentry * parent, int * racecheck) - { - struct dentry *this_parent = parent; - struct list_head *next; - int found = 0; - -- spin_lock(&dcache_lock); - repeat: - next = this_parent->d_subdirs.next; - resume: -@@ -605,6 +720,9 @@ dentry->d_parent->d_name.name, dentry->d - #endif - goto repeat; - } -+ -+ if (!found && racecheck != NULL) -+ shrink_dcache_racecheck(dentry, racecheck); - } - /* - * All done at this level ... ascend and resume the search. -@@ -619,7 +737,6 @@ this_parent->d_parent->d_name.name, this - goto resume; - } - out: -- spin_unlock(&dcache_lock); - return found; - } - -@@ -632,10 +749,66 @@ out: - - void shrink_dcache_parent(struct dentry * parent) - { -- int found; -+ int found, r; -+ -+ while (1) { -+ spin_lock(&dcache_lock); -+ found = select_parent(parent, NULL); -+ if (found) -+ goto found; - -- while ((found = select_parent(parent)) != 0) -+ /* -+ * try again with a dput_recursive() race check. -+ * it returns quickly if everything was really shrinked -+ */ -+ r = 0; -+ found = select_parent(parent, &r); -+ if (found) -+ goto found; -+ if (!r) -+ break; -+ -+ /* drops the lock inside */ -+ dcache_shrinker_wait(parent->d_sb); -+ continue; -+ -+found: -+ spin_unlock(&dcache_lock); - prune_dcache(found); -+ } -+ spin_unlock(&dcache_lock); -+} -+ -+/* -+ * Move any unused anon dentries to the end of the unused list. 
-+ * called under dcache_lock -+ */ -+static int select_anon(struct hlist_head *head, int *racecheck) -+{ -+ struct hlist_node *lp; -+ int found = 0; -+ -+ hlist_for_each(lp, head) { -+ struct dentry *this = hlist_entry(lp, struct dentry, d_hash); -+ if (!list_empty(&this->d_lru)) { -+ dentry_stat.nr_unused--; -+ list_del_init(&this->d_lru); -+ } -+ -+ /* -+ * move only zero ref count dentries to the end -+ * of the unused list for prune_dcache -+ */ -+ if (!atomic_read(&this->d_count)) { -+ list_add_tail(&this->d_lru, &dentry_unused); -+ dentry_stat.nr_unused++; -+ found++; -+ } -+ -+ if (!found && racecheck != NULL) -+ shrink_dcache_racecheck(this, racecheck); -+ } -+ return found; - } - - /** -@@ -648,33 +821,36 @@ void shrink_dcache_parent(struct dentry - * done under dcache_lock. - * - */ --void shrink_dcache_anon(struct hlist_head *head) -+void shrink_dcache_anon(struct super_block *sb) - { -- struct hlist_node *lp; -- int found; -- do { -- found = 0; -+ int found, r; -+ -+ while (1) { - spin_lock(&dcache_lock); -- hlist_for_each(lp, head) { -- struct dentry *this = hlist_entry(lp, struct dentry, d_hash); -- if (!list_empty(&this->d_lru)) { -- dentry_stat.nr_unused--; -- list_del_init(&this->d_lru); -- } -+ found = select_anon(&sb->s_anon, NULL); -+ if (found) -+ goto found; - -- /* -- * move only zero ref count dentries to the end -- * of the unused list for prune_dcache -- */ -- if (!atomic_read(&this->d_count)) { -- list_add_tail(&this->d_lru, &dentry_unused); -- dentry_stat.nr_unused++; -- found++; -- } -- } -+ /* -+ * try again with a dput_recursive() race check. 
-+ * it returns quickly if everything was really shrinked -+ */ -+ r = 0; -+ found = select_anon(&sb->s_anon, &r); -+ if (found) -+ goto found; -+ if (!r) -+ break; -+ -+ /* drops the lock inside */ -+ dcache_shrinker_wait(sb); -+ continue; -+ -+found: - spin_unlock(&dcache_lock); - prune_dcache(found); -- } while(found); -+ } -+ spin_unlock(&dcache_lock); - } - - /* -@@ -691,12 +867,18 @@ void shrink_dcache_anon(struct hlist_hea - */ - static int shrink_dcache_memory(int nr, gfp_t gfp_mask) - { -+ int res = -1; -+ -+ KSTAT_PERF_ENTER(shrink_dcache) - if (nr) { - if (!(gfp_mask & __GFP_FS)) -- return -1; -+ goto out; - prune_dcache(nr); - } -- return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; -+ res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; -+out: -+ KSTAT_PERF_LEAVE(shrink_dcache) -+ return res; - } - - /** -@@ -716,19 +898,20 @@ struct dentry *d_alloc(struct dentry * p - - dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); - if (!dentry) -- return NULL; -+ goto err_alloc; - - if (name->len > DNAME_INLINE_LEN-1) { - dname = kmalloc(name->len + 1, GFP_KERNEL); -- if (!dname) { -- kmem_cache_free(dentry_cache, dentry); -- return NULL; -- } -+ if (!dname) -+ goto err_name; - } else { - dname = dentry->d_iname; - } - dentry->d_name.name = dname; - -+ if (ub_dentry_alloc(dentry)) -+ goto err_charge; -+ - dentry->d_name.len = name->len; - dentry->d_name.hash = name->hash; - memcpy(dname, name->name, name->len); -@@ -759,12 +942,23 @@ struct dentry *d_alloc(struct dentry * p - } - - spin_lock(&dcache_lock); -- if (parent) -+ if (parent) { - list_add(&dentry->d_u.d_child, &parent->d_subdirs); -+ if (parent->d_flags & DCACHE_VIRTUAL) -+ dentry->d_flags |= DCACHE_VIRTUAL; -+ } - dentry_stat.nr_dentry++; - spin_unlock(&dcache_lock); - - return dentry; -+ -+err_charge: -+ if (name->len > DNAME_INLINE_LEN - 1) -+ kfree(dname); -+err_name: -+ kmem_cache_free(dentry_cache, dentry); -+err_alloc: -+ return NULL; - } - - struct dentry 
*d_alloc_name(struct dentry *parent, const char *name) -@@ -1048,7 +1242,6 @@ struct dentry * __d_lookup(struct dentry - unsigned int hash = name->hash; - const unsigned char *str = name->name; - struct hlist_head *head = d_hash(parent,hash); -- struct dentry *found = NULL; - struct hlist_node *node; - struct dentry *dentry; - -@@ -1089,7 +1282,7 @@ struct dentry * __d_lookup(struct dentry - - if (!d_unhashed(dentry)) { - atomic_inc(&dentry->d_count); -- found = dentry; -+ goto found; - } - spin_unlock(&dentry->d_lock); - break; -@@ -1098,7 +1291,18 @@ next: - } - rcu_read_unlock(); - -- return found; -+ return NULL; -+ -+found: -+ /* -+ * d_lock and rcu_read_lock -+ * are dropped in ub_dentry_charge() -+ */ -+ if (ub_dentry_charge(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; - } - - /** -@@ -1345,6 +1549,32 @@ already_unhashed: - } - - /** -+ * __d_path_add_deleted - prepend "(deleted) " text -+ * @end: a pointer to the character after free space at the beginning of the -+ * buffer -+ * @buflen: remaining free space -+ */ -+static inline char * __d_path_add_deleted(char * end, int buflen) -+{ -+ buflen -= 10; -+ if (buflen < 0) -+ return ERR_PTR(-ENAMETOOLONG); -+ end -= 10; -+ memcpy(end, "(deleted) ", 10); -+ return end; -+} -+ -+/** -+ * d_root_check - checks if dentry is accessible from current's fs root -+ * @dentry: dentry to be verified -+ * @vfsmnt: vfsmnt to which the dentry belongs -+ */ -+int d_root_check(struct dentry *dentry, struct vfsmount *vfsmnt) -+{ -+ return PTR_ERR(d_path(dentry, vfsmnt, NULL, 0)); -+} -+ -+/** - * d_path - return the path of a dentry - * @dentry: dentry to report - * @vfsmnt: vfsmnt to which the dentry belongs -@@ -1365,36 +1595,35 @@ static char * __d_path( struct dentry *d - char *buffer, int buflen) - { - char * end = buffer+buflen; -- char * retval; -+ char * retval = NULL; - int namelen; -+ int deleted; -+ struct vfsmount *oldvfsmnt; - -- *--end = '\0'; -- buflen--; -- if (!IS_ROOT(dentry) && 
d_unhashed(dentry)) { -- buflen -= 10; -- end -= 10; -- if (buflen < 0) -+ oldvfsmnt = vfsmnt; -+ deleted = (!IS_ROOT(dentry) && d_unhashed(dentry)); -+ if (buffer != NULL) { -+ *--end = '\0'; -+ buflen--; -+ -+ if (buflen < 1) - goto Elong; -- memcpy(end, " (deleted)", 10); -+ /* Get '/' right */ -+ retval = end-1; -+ *retval = '/'; - } - -- if (buflen < 1) -- goto Elong; -- /* Get '/' right */ -- retval = end-1; -- *retval = '/'; -- - for (;;) { - struct dentry * parent; - - if (dentry == root && vfsmnt == rootmnt) - break; - if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { -- /* Global root? */ -+ /* root of a tree? */ - spin_lock(&vfsmount_lock); - if (vfsmnt->mnt_parent == vfsmnt) { - spin_unlock(&vfsmount_lock); -- goto global_root; -+ goto other_root; - } - dentry = vfsmnt->mnt_mountpoint; - vfsmnt = vfsmnt->mnt_parent; -@@ -1403,27 +1632,51 @@ static char * __d_path( struct dentry *d - } - parent = dentry->d_parent; - prefetch(parent); -+ if (buffer != NULL) { -+ namelen = dentry->d_name.len; -+ buflen -= namelen + 1; -+ if (buflen < 0) -+ goto Elong; -+ end -= namelen; -+ memcpy(end, dentry->d_name.name, namelen); -+ *--end = '/'; -+ retval = end; -+ } -+ dentry = parent; -+ } -+ /* the given root point is reached */ -+finish: -+ if (buffer != NULL && deleted) -+ retval = __d_path_add_deleted(end, buflen); -+ return retval; -+ -+other_root: -+ /* -+ * We traversed the tree upward and reached a root, but the given -+ * lookup terminal point wasn't encountered. It means either that the -+ * dentry is out of our scope or belongs to an abstract space like -+ * sock_mnt or pipe_mnt. Check for it. -+ * -+ * There are different options to check it. -+ * We may assume that any dentry tree is unreachable unless it's -+ * connected to `root' (defined as fs root of init aka child reaper) -+ * and expose all paths that are not connected to it. 
-+ * The other option is to allow exposing of known abstract spaces -+ * explicitly and hide the path information for other cases. -+ * This approach is more safe, let's take it. 2001/04/22 SAW -+ */ -+ if (!(oldvfsmnt->mnt_sb->s_flags & MS_NOUSER)) -+ return ERR_PTR(-EINVAL); -+ if (buffer != NULL) { - namelen = dentry->d_name.len; -- buflen -= namelen + 1; -+ buflen -= namelen; - if (buflen < 0) - goto Elong; -- end -= namelen; -- memcpy(end, dentry->d_name.name, namelen); -- *--end = '/'; -- retval = end; -- dentry = parent; -+ retval -= namelen-1; /* hit the slash */ -+ memcpy(retval, dentry->d_name.name, namelen); - } -+ goto finish; - -- return retval; -- --global_root: -- namelen = dentry->d_name.len; -- buflen -= namelen; -- if (buflen < 0) -- goto Elong; -- retval -= namelen-1; /* hit the slash */ -- memcpy(retval, dentry->d_name.name, namelen); -- return retval; - Elong: - return ERR_PTR(-ENAMETOOLONG); - } -@@ -1448,6 +1701,228 @@ char * d_path(struct dentry *dentry, str - return res; - } - -+#ifdef CONFIG_VE -+#include <net/sock.h> -+#include <linux/ip.h> -+#include <linux/file.h> -+#include <linux/namespace.h> -+#include <linux/vzratelimit.h> -+ -+static void mark_sub_tree_virtual(struct dentry *d) -+{ -+ struct dentry *orig_root; -+ -+ orig_root = d; -+ while (1) { -+ spin_lock(&d->d_lock); -+ d->d_flags |= DCACHE_VIRTUAL; -+ spin_unlock(&d->d_lock); -+ -+ if (!list_empty(&d->d_subdirs)) { -+ d = list_entry(d->d_subdirs.next, -+ struct dentry, d_u.d_child); -+ continue; -+ } -+ if (d == orig_root) -+ break; -+ while (d == list_entry(d->d_parent->d_subdirs.prev, -+ struct dentry, d_u.d_child)) { -+ d = d->d_parent; -+ if (d == orig_root) -+ goto out; -+ } -+ d = list_entry(d->d_u.d_child.next, -+ struct dentry, d_u.d_child); -+ } -+out: -+ return; -+} -+ -+void mark_tree_virtual(struct vfsmount *m, struct dentry *d) -+{ -+ struct vfsmount *orig_rootmnt; -+ -+ spin_lock(&dcache_lock); -+ spin_lock(&vfsmount_lock); -+ orig_rootmnt = m; -+ while (1) { -+ 
mark_sub_tree_virtual(d); -+ if (!list_empty(&m->mnt_mounts)) { -+ m = list_entry(m->mnt_mounts.next, -+ struct vfsmount, mnt_child); -+ d = m->mnt_root; -+ continue; -+ } -+ if (m == orig_rootmnt) -+ break; -+ while (m == list_entry(m->mnt_parent->mnt_mounts.prev, -+ struct vfsmount, mnt_child)) { -+ m = m->mnt_parent; -+ if (m == orig_rootmnt) -+ goto out; -+ } -+ m = list_entry(m->mnt_child.next, -+ struct vfsmount, mnt_child); -+ d = m->mnt_root; -+ } -+out: -+ spin_unlock(&vfsmount_lock); -+ spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(mark_tree_virtual); -+ -+static struct vz_rate_info area_ri = { 20, 10*HZ }; -+#define VE_AREA_ACC_CHECK 0x0001 -+#define VE_AREA_ACC_DENY 0x0002 -+#define VE_AREA_EXEC_CHECK 0x0010 -+#define VE_AREA_EXEC_DENY 0x0020 -+#define VE0_AREA_ACC_CHECK 0x0100 -+#define VE0_AREA_ACC_DENY 0x0200 -+#define VE0_AREA_EXEC_CHECK 0x1000 -+#define VE0_AREA_EXEC_DENY 0x2000 -+int ve_area_access_check = 0; -+ -+static void print_connection_info(struct task_struct *tsk) -+{ -+ struct files_struct *files; -+ struct fdtable *fdt; -+ int fd; -+ -+ files = get_files_struct(tsk); -+ if (!files) -+ return; -+ -+ spin_lock(&files->file_lock); -+ fdt = files_fdtable(files); -+ for (fd = 0; fd < fdt->max_fds; fd++) { -+ struct file *file; -+ struct inode *inode; -+ struct socket *socket; -+ struct sock *sk; -+ struct inet_sock *inet; -+ -+ file = fdt->fd[fd]; -+ if (file == NULL) -+ continue; -+ -+ inode = file->f_dentry->d_inode; -+ if (!S_ISSOCK(inode->i_mode)) -+ continue; -+ -+ socket = SOCKET_I(inode); -+ if (socket == NULL) -+ continue; -+ -+ sk = socket->sk; -+ if (sk->sk_family != PF_INET || sk->sk_type != SOCK_STREAM) -+ continue; -+ -+ inet = inet_sk(sk); -+ printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n", -+ NIPQUAD(inet->daddr), ntohs(inet->dport), -+ inet->num); -+ } -+ spin_unlock(&files->file_lock); -+ put_files_struct(files); -+} -+ -+static void check_alert(struct vfsmount *vfsmnt, struct dentry *dentry, -+ char *str) 
-+{ -+ struct task_struct *tsk; -+ unsigned long page; -+ struct super_block *sb; -+ char *p; -+ -+ if (!vz_ratelimit(&area_ri)) -+ return; -+ -+ tsk = current; -+ p = ERR_PTR(-ENOMEM); -+ page = __get_free_page(GFP_KERNEL); -+ if (page) { -+ spin_lock(&dcache_lock); -+ p = __d_path(dentry, vfsmnt, tsk->fs->root, tsk->fs->rootmnt, -+ (char *)page, PAGE_SIZE); -+ spin_unlock(&dcache_lock); -+ } -+ if (IS_ERR(p)) -+ p = "(undefined)"; -+ -+ sb = dentry->d_sb; -+ printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n" -+ "Task %d/%d[%s] from VE%d, execenv %d\n", -+ str, p, VE_OWNER_FSTYPE(sb->s_type)->veid, -+ sb->s_type->name, sb->s_dev, -+ tsk->pid, virt_pid(tsk), tsk->comm, -+ VE_TASK_INFO(tsk)->owner_env->veid, -+ get_exec_env()->veid); -+ -+ free_page(page); -+ -+ print_connection_info(tsk); -+ -+ read_lock(&tasklist_lock); -+ tsk = tsk->real_parent; -+ get_task_struct(tsk); -+ read_unlock(&tasklist_lock); -+ -+ printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n", -+ tsk->pid, virt_pid(tsk), tsk->comm, -+ VE_TASK_INFO(tsk)->owner_env->veid); -+ -+ print_connection_info(tsk); -+ put_task_struct(tsk); -+ dump_stack(); -+} -+#endif -+ -+int check_area_access_ve(struct dentry *dentry, struct vfsmount *mnt) -+{ -+#ifdef CONFIG_VE -+ int check, alert, deny; -+ -+ if (ve_is_super(get_exec_env())) { -+ check = ve_area_access_check & VE0_AREA_ACC_CHECK; -+ alert = dentry->d_flags & DCACHE_VIRTUAL; -+ deny = ve_area_access_check & VE0_AREA_ACC_DENY; -+ } else { -+ check = ve_area_access_check & VE_AREA_ACC_CHECK; -+ alert = !(dentry->d_flags & DCACHE_VIRTUAL); -+ deny = ve_area_access_check & VE_AREA_ACC_DENY; -+ } -+ -+ if (check && alert) -+ check_alert(mnt, dentry, "Access"); -+ if (deny && alert) -+ return -EACCES; -+#endif -+ return 0; -+} -+ -+int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt) -+{ -+#ifdef CONFIG_VE -+ int check, alert, deny; -+ -+ if (ve_is_super(get_exec_env())) { -+ check = ve_area_access_check & VE0_AREA_EXEC_CHECK; -+ 
alert = dentry->d_flags & DCACHE_VIRTUAL; -+ deny = ve_area_access_check & VE0_AREA_EXEC_DENY; -+ } else { -+ check = ve_area_access_check & VE_AREA_EXEC_CHECK; -+ alert = !(dentry->d_flags & DCACHE_VIRTUAL); -+ deny = ve_area_access_check & VE_AREA_EXEC_DENY; -+ } -+ -+ if (check && alert) -+ check_alert(mnt, dentry, "Exec"); -+ if (deny && alert) -+ return -EACCES; -+#endif -+ return 0; -+} -+ - /* - * NOTE! The user-level library version returns a - * character pointer. The kernel system call just -@@ -1584,10 +2059,12 @@ resume: - goto repeat; - } - atomic_dec(&dentry->d_count); -+ ub_dentry_uncharge(dentry); - } - if (this_parent != root) { - next = this_parent->d_u.d_child.next; - atomic_dec(&this_parent->d_count); -+ ub_dentry_uncharge(this_parent); - this_parent = this_parent->d_parent; - goto resume; - } -@@ -1736,7 +2213,8 @@ void __init vfs_caches_init(unsigned lon - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, -- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); -+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, -+ NULL, NULL); - - dcache_init(mempages); - inode_init(mempages); -diff -upr linux-2.6.16.orig/fs/devpts/inode.c linux-2.6.16-026test009/fs/devpts/inode.c ---- linux-2.6.16.orig/fs/devpts/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/devpts/inode.c 2006-04-19 15:02:12.000000000 +0400 -@@ -12,6 +12,7 @@ - - #include <linux/module.h> - #include <linux/init.h> -+#include <linux/ve.h> - #include <linux/fs.h> - #include <linux/sched.h> - #include <linux/namei.h> -@@ -21,16 +22,17 @@ - - #define DEVPTS_SUPER_MAGIC 0x1cd1 - -+struct devpts_config devpts_config = {.mode = 0600}; -+ -+#ifndef CONFIG_VE - static struct vfsmount *devpts_mnt; - static struct dentry *devpts_root; -- --static struct { -- int setuid; -- int setgid; -- uid_t uid; -- gid_t gid; -- umode_t mode; --} config = {.mode = 0600}; -+#define config devpts_config -+#else -+#define devpts_mnt 
(get_exec_env()->devpts_mnt) -+#define devpts_root (get_exec_env()->devpts_root) -+#define config (*(get_exec_env()->devpts_config)) -+#endif - - static int devpts_remount(struct super_block *sb, int *flags, char *data) - { -@@ -56,7 +58,8 @@ static int devpts_remount(struct super_b - } else if (sscanf(this_char, "mode=%o%c", &n, &dummy) == 1) - mode = n & ~S_IFMT; - else { -- printk("devpts: called with bogus options\n"); -+ ve_printk(VE_LOG, -+ "devpts: called with bogus options\n"); - return -EINVAL; - } - } -@@ -114,13 +117,15 @@ static struct super_block *devpts_get_sb - return get_sb_single(fs_type, flags, data, devpts_fill_super); - } - --static struct file_system_type devpts_fs_type = { -+struct file_system_type devpts_fs_type = { - .owner = THIS_MODULE, - .name = "devpts", - .get_sb = devpts_get_sb, - .kill_sb = kill_anon_super, - }; - -+EXPORT_SYMBOL(devpts_fs_type); -+ - /* - * The normal naming convention is simply /dev/pts/<number>; this conforms - * to the System V naming convention -@@ -212,6 +217,7 @@ static int __init init_devpts_fs(void) - - static void __exit exit_devpts_fs(void) - { -+ /* the code is never called, the argument is irrelevant */ - unregister_filesystem(&devpts_fs_type); - mntput(devpts_mnt); - } -diff -upr linux-2.6.16.orig/fs/eventpoll.c linux-2.6.16-026test009/fs/eventpoll.c ---- linux-2.6.16.orig/fs/eventpoll.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/eventpoll.c 2006-04-19 15:02:12.000000000 +0400 -@@ -105,11 +105,6 @@ - #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) - - --struct epoll_filefd { -- struct file *file; -- int fd; --}; -- - /* - * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 
- * It is used to keep track on all tasks that are currently inside the wake_up() code -@@ -132,36 +127,6 @@ struct poll_safewake { - spinlock_t lock; - }; - --/* -- * This structure is stored inside the "private_data" member of the file -- * structure and rapresent the main data sructure for the eventpoll -- * interface. -- */ --struct eventpoll { -- /* Protect the this structure access */ -- rwlock_t lock; -- -- /* -- * This semaphore is used to ensure that files are not removed -- * while epoll is using them. This is read-held during the event -- * collection loop and it is write-held during the file cleanup -- * path, the epoll file exit code and the ctl operations. -- */ -- struct rw_semaphore sem; -- -- /* Wait queue used by sys_epoll_wait() */ -- wait_queue_head_t wq; -- -- /* Wait queue used by file->poll() */ -- wait_queue_head_t poll_wait; -- -- /* List of ready file descriptors */ -- struct list_head rdllist; -- -- /* RB-Tree root used to store monitored fd structs */ -- struct rb_root rbr; --}; -- - /* Wait structure used by the poll hooks */ - struct eppoll_entry { - /* List header used to link this structure to the "struct epitem" */ -@@ -180,51 +145,6 @@ struct eppoll_entry { - wait_queue_head_t *whead; - }; - --/* -- * Each file descriptor added to the eventpoll interface will -- * have an entry of this type linked to the hash. 
-- */ --struct epitem { -- /* RB-Tree node used to link this structure to the eventpoll rb-tree */ -- struct rb_node rbn; -- -- /* List header used to link this structure to the eventpoll ready list */ -- struct list_head rdllink; -- -- /* The file descriptor information this item refers to */ -- struct epoll_filefd ffd; -- -- /* Number of active wait queue attached to poll operations */ -- int nwait; -- -- /* List containing poll wait queues */ -- struct list_head pwqlist; -- -- /* The "container" of this item */ -- struct eventpoll *ep; -- -- /* The structure that describe the interested events and the source fd */ -- struct epoll_event event; -- -- /* -- * Used to keep track of the usage count of the structure. This avoids -- * that the structure will desappear from underneath our processing. -- */ -- atomic_t usecnt; -- -- /* List header used to link this item to the "struct file" items list */ -- struct list_head fllink; -- -- /* List header used to link the item to the transfer list */ -- struct list_head txlink; -- -- /* -- * This is used during the collection/transfer of events to userspace -- * to pin items empty events set. 
-- */ -- unsigned int revents; --}; -- - /* Wrapper struct used by poll queueing */ - struct ep_pqueue { - poll_table pt; -@@ -239,14 +159,10 @@ static int ep_getfd(int *efd, struct ino - struct eventpoll *ep); - static int ep_alloc(struct eventpoll **pep); - static void ep_free(struct eventpoll *ep); --static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); - static void ep_use_epitem(struct epitem *epi); --static void ep_release_epitem(struct epitem *epi); - static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); - static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi); --static int ep_insert(struct eventpoll *ep, struct epoll_event *event, -- struct file *tfile, int fd); - static int ep_modify(struct eventpoll *ep, struct epitem *epi, - struct epoll_event *event); - static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi); -@@ -274,7 +190,8 @@ static struct super_block *eventpollfs_g - /* - * This semaphore is used to serialize ep_free() and eventpoll_release_file(). - */ --static struct semaphore epsem; -+struct semaphore epsem; -+EXPORT_SYMBOL_GPL(epsem); - - /* Safe wake up implementation */ - static struct poll_safewake psw; -@@ -293,6 +210,7 @@ static struct file_operations eventpoll_ - .release = ep_eventpoll_close, - .poll = ep_eventpoll_poll - }; -+EXPORT_SYMBOL_GPL(eventpoll_fops); - - /* - * This is used to register the virtual file system from where -@@ -542,7 +460,7 @@ eexit_1: - current, size, error)); - return error; - } -- -+EXPORT_SYMBOL_GPL(sys_epoll_create); - - /* - * The following function implements the controller interface for -@@ -852,7 +770,7 @@ static void ep_free(struct eventpoll *ep - * the returned item, so the caller must call ep_release_epitem() - * after finished using the "struct epitem". 
- */ --static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) -+struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) - { - int kcmp; - unsigned long flags; -@@ -882,6 +800,7 @@ static struct epitem *ep_find(struct eve - - return epir; - } -+EXPORT_SYMBOL_GPL(ep_find); - - - /* -@@ -900,13 +819,13 @@ static void ep_use_epitem(struct epitem - * has finished using the structure. It might lead to freeing the - * structure itself if the count goes to zero. - */ --static void ep_release_epitem(struct epitem *epi) -+void ep_release_epitem(struct epitem *epi) - { - - if (atomic_dec_and_test(&epi->usecnt)) - kmem_cache_free(epi_cache, epi); - } -- -+EXPORT_SYMBOL_GPL(ep_release_epitem); - - /* - * This is the callback that is used to add our wait queue to the -@@ -952,7 +871,7 @@ static void ep_rbtree_insert(struct even - } - - --static int ep_insert(struct eventpoll *ep, struct epoll_event *event, -+int ep_insert(struct eventpoll *ep, struct epoll_event *event, - struct file *tfile, int fd) - { - int error, revents, pwake = 0; -@@ -1044,6 +963,7 @@ eexit_2: - eexit_1: - return error; - } -+EXPORT_SYMBOL_GPL(ep_insert); - - - /* -diff -upr linux-2.6.16.orig/fs/exec.c linux-2.6.16-026test009/fs/exec.c ---- linux-2.6.16.orig/fs/exec.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/exec.c 2006-04-19 15:02:12.000000000 +0400 -@@ -53,6 +53,8 @@ - #include <asm/uaccess.h> - #include <asm/mmu_context.h> - -+#include <ub/ub_vmpages.h> -+ - #ifdef CONFIG_KMOD - #include <linux/kmod.h> - #endif -@@ -64,6 +66,8 @@ int suid_dumpable = 0; - EXPORT_SYMBOL(suid_dumpable); - /* The maximal length of core_pattern is also specified in sysctl.c */ - -+int sysctl_at_vsyscall; -+ - static struct linux_binfmt *formats; - static DEFINE_RWLOCK(binfmt_lock); - -@@ -135,7 +139,7 @@ asmlinkage long sys_uselib(const char __ - if (!S_ISREG(nd.dentry->d_inode->i_mode)) - goto exit; - -- error = vfs_permission(&nd, MAY_READ | MAY_EXEC); -+ 
error = vfs_permission(&nd, MAY_READ | MAY_EXEC, NULL); - if (error) - goto exit; - -@@ -308,6 +312,10 @@ void install_arg_page(struct vm_area_str - struct mm_struct *mm = vma->vm_mm; - pte_t * pte; - spinlock_t *ptl; -+ struct page_beancounter *pb; -+ -+ if (unlikely(pb_alloc(&pb))) -+ goto out_nopb; - - if (unlikely(anon_vma_prepare(vma))) - goto out; -@@ -321,15 +329,21 @@ void install_arg_page(struct vm_area_str - goto out; - } - inc_mm_counter(mm, anon_rss); -+ inc_vma_rss(vma); - lru_cache_add_active(page); - set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( - page, vma->vm_page_prot)))); -+ pb_add_ref(page, mm, &pb); -+ ub_unused_privvm_dec(mm, vma); -+ pb_free(&pb); - page_add_new_anon_rmap(page, vma, address); - pte_unmap_unlock(pte, ptl); - - /* no need for flush_tlb */ - return; - out: -+ pb_free(&pb); -+out_nopb: - __free_page(page); - force_sig(SIGKILL, current); - } -@@ -404,9 +418,13 @@ int setup_arg_pages(struct linux_binprm - bprm->loader += stack_base; - bprm->exec += stack_base; - -- mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); -+ if (ub_memory_charge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, -+ NULL, UB_SOFT)) -+ goto fail_charge; -+ -+ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | __GFP_SOFT_UBC); - if (!mpnt) -- return -ENOMEM; -+ goto fail_alloc; - - memset(mpnt, 0, sizeof(*mpnt)); - -@@ -450,6 +468,11 @@ int setup_arg_pages(struct linux_binprm - up_write(&mm->mmap_sem); - - return 0; -+ -+fail_alloc: -+ ub_memory_uncharge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, NULL); -+fail_charge: -+ return -ENOMEM; - } - - EXPORT_SYMBOL(setup_arg_pages); -@@ -471,7 +494,7 @@ static inline void free_arg_pages(struct - - #endif /* CONFIG_MMU */ - --struct file *open_exec(const char *name) -+struct file *open_exec(const char *name, struct linux_binprm *bprm) - { - struct nameidata nd; - int err; -@@ -485,9 +508,16 @@ struct file *open_exec(const char *name) - file = ERR_PTR(-EACCES); - if (!(nd.mnt->mnt_flags & 
MNT_NOEXEC) && - S_ISREG(inode->i_mode)) { -- int err = vfs_permission(&nd, MAY_EXEC); -- if (!err && !(inode->i_mode & 0111)) -- err = -EACCES; -+ int err; -+ struct exec_perm *perm; -+ -+ if (bprm != NULL) { -+ perm = &bprm->perm; -+ perm->set = 0; -+ } else -+ perm = NULL; -+ -+ err = vfs_permission(&nd, MAY_EXEC, perm); - file = ERR_PTR(err); - if (!err) { - file = nameidata_to_filp(&nd, O_RDONLY); -@@ -657,7 +687,7 @@ static int de_thread(struct task_struct - */ - if (!thread_group_leader(current)) { - struct task_struct *parent; -- struct dentry *proc_dentry1, *proc_dentry2; -+ struct dentry *proc_dentry1[2], *proc_dentry2[2]; - unsigned long ptrace; - - /* -@@ -671,8 +701,8 @@ static int de_thread(struct task_struct - - spin_lock(&leader->proc_lock); - spin_lock(&current->proc_lock); -- proc_dentry1 = proc_pid_unhash(current); -- proc_dentry2 = proc_pid_unhash(leader); -+ proc_pid_unhash(current, proc_dentry1); -+ proc_pid_unhash(leader, proc_dentry2); - write_lock_irq(&tasklist_lock); - - BUG_ON(leader->tgid != current->tgid); -@@ -829,7 +859,7 @@ int flush_old_exec(struct linux_binprm * - { - char * name; - int i, ch, retval; -- struct files_struct *files; -+ struct files_struct *files, *old; - char tcomm[sizeof(current->comm)]; - - /* -@@ -897,6 +927,7 @@ int flush_old_exec(struct linux_binprm * - suid_keys(current); - current->mm->dumpable = suid_dumpable; - } -+ current->mm->vps_dumpable = 1; - - /* An exec changes our domain. 
We are no longer part of the thread - group */ -@@ -909,8 +940,11 @@ int flush_old_exec(struct linux_binprm * - return 0; - - mmap_failed: -- put_files_struct(current->files); -+ old = current->files; -+ task_lock(current); - current->files = files; -+ task_unlock(current); -+ put_files_struct(old); - out: - return retval; - } -@@ -927,13 +961,6 @@ int prepare_binprm(struct linux_binprm * - struct inode * inode = bprm->file->f_dentry->d_inode; - int retval; - -- mode = inode->i_mode; -- /* -- * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, -- * generic_permission lets a non-executable through -- */ -- if (!(mode & 0111)) /* with at least _one_ execute bit set */ -- return -EACCES; - if (bprm->file->f_op == NULL) - return -EACCES; - -@@ -941,10 +968,24 @@ int prepare_binprm(struct linux_binprm * - bprm->e_gid = current->egid; - - if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { -+ if (!bprm->perm.set) { -+ /* -+ * This piece of code creates a time window between -+ * MAY_EXEC permission check and setuid/setgid -+ * operations and may be considered as a security hole. -+ * This code is here for compatibility reasons, -+ * if the filesystem is unable to return info now. -+ */ -+ bprm->perm.mode = inode->i_mode; -+ bprm->perm.uid = inode->i_uid; -+ bprm->perm.gid = inode->i_gid; -+ } -+ mode = bprm->perm.mode; -+ - /* Set-uid? */ - if (mode & S_ISUID) { - current->personality &= ~PER_CLEAR_ON_SETID; -- bprm->e_uid = inode->i_uid; -+ bprm->e_uid = bprm->perm.uid; - } - - /* Set-gid? 
*/ -@@ -955,7 +996,7 @@ int prepare_binprm(struct linux_binprm * - */ - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - current->personality &= ~PER_CLEAR_ON_SETID; -- bprm->e_gid = inode->i_gid; -+ bprm->e_gid = bprm->perm.gid; - } - } - -@@ -1054,7 +1095,7 @@ int search_binary_handler(struct linux_b - - loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - -- file = open_exec("/sbin/loader"); -+ file = open_exec("/sbin/loader", bprm); - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; -@@ -1148,7 +1189,7 @@ int do_execve(char * filename, - goto out_ret; - memset(bprm, 0, sizeof(*bprm)); - -- file = open_exec(filename); -+ file = open_exec(filename, bprm); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_kfree; -@@ -1288,7 +1329,7 @@ static void format_corename(char *corena - case 'p': - pid_in_pattern = 1; - rc = snprintf(out_ptr, out_end - out_ptr, -- "%d", current->tgid); -+ "%d", virt_tgid(current)); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; -@@ -1332,7 +1373,7 @@ static void format_corename(char *corena - case 'h': - down_read(&uts_sem); - rc = snprintf(out_ptr, out_end - out_ptr, -- "%s", system_utsname.nodename); -+ "%s", ve_utsname.nodename); - up_read(&uts_sem); - if (rc > out_end - out_ptr) - goto out; -@@ -1360,7 +1401,7 @@ static void format_corename(char *corena - if (!pid_in_pattern - && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) { - rc = snprintf(out_ptr, out_end - out_ptr, -- ".%d", current->tgid); -+ ".%d", virt_tgid(current)); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; -@@ -1386,7 +1427,7 @@ static void zap_threads (struct mm_struc - } - - read_lock(&tasklist_lock); -- do_each_thread(g,p) -+ do_each_thread_ve(g,p) - if (mm == p->mm && p != tsk) { - force_sig_specific(SIGKILL, p); - mm->core_waiters++; -@@ -1394,7 +1435,7 @@ static void zap_threads (struct mm_struc - unlikely(p->parent->mm == mm)) - traced = 1; - } -- while_each_thread(g,p); -+ while_each_thread_ve(g,p); - 
- read_unlock(&tasklist_lock); - -@@ -1406,12 +1447,12 @@ static void zap_threads (struct mm_struc - * coredump to finish. Detach them so they can both die. - */ - write_lock_irq(&tasklist_lock); -- do_each_thread(g,p) { -+ do_each_thread_ve(g,p) { - if (mm == p->mm && p != tsk && - p->ptrace && p->parent->mm == mm) { - __ptrace_detach(p, 0); - } -- } while_each_thread(g,p); -+ } while_each_thread_ve(g,p); - write_unlock_irq(&tasklist_lock); - } - } -@@ -1447,7 +1488,8 @@ int do_coredump(long signr, int exit_cod - if (!binfmt || !binfmt->core_dump) - goto fail; - down_write(&mm->mmap_sem); -- if (!mm->dumpable) { -+ if (!mm->dumpable || -+ (!mm->vps_dumpable && !ve_is_super(get_exec_env()))) { - up_write(&mm->mmap_sem); - goto fail; - } -diff -upr linux-2.6.16.orig/fs/ext2/acl.c linux-2.6.16-026test009/fs/ext2/acl.c ---- linux-2.6.16.orig/fs/ext2/acl.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ext2/acl.c 2006-04-19 15:02:11.000000000 +0400 -@@ -294,9 +294,10 @@ ext2_check_acl(struct inode *inode, int - } - - int --ext2_permission(struct inode *inode, int mask, struct nameidata *nd) -+ext2_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { -- return generic_permission(inode, mask, ext2_check_acl); -+ return generic_permission(inode, mask, ext2_check_acl, perm); - } - - /* -diff -upr linux-2.6.16.orig/fs/ext2/acl.h linux-2.6.16-026test009/fs/ext2/acl.h ---- linux-2.6.16.orig/fs/ext2/acl.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ext2/acl.h 2006-04-19 15:02:11.000000000 +0400 -@@ -58,7 +58,8 @@ static inline int ext2_acl_count(size_t - #define EXT2_ACL_NOT_CACHED ((void *)-1) - - /* acl.c */ --extern int ext2_permission (struct inode *, int, struct nameidata *); -+extern int ext2_permission (struct inode *, int, struct nameidata *, -+ struct exec_perm *); - extern int ext2_acl_chmod (struct inode *); - extern int ext2_init_acl (struct inode *, struct inode *); - -diff -upr 
linux-2.6.16.orig/fs/ext2/namei.c linux-2.6.16-026test009/fs/ext2/namei.c ---- linux-2.6.16.orig/fs/ext2/namei.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ext2/namei.c 2006-04-19 15:02:12.000000000 +0400 -@@ -31,6 +31,7 @@ - */ - - #include <linux/pagemap.h> -+#include <linux/quotaops.h> - #include "ext2.h" - #include "xattr.h" - #include "acl.h" -@@ -273,6 +274,8 @@ static int ext2_unlink(struct inode * di - struct page * page; - int err = -ENOENT; - -+ DQUOT_INIT(inode); -+ - de = ext2_find_entry (dir, dentry, &page); - if (!de) - goto out; -@@ -315,6 +318,9 @@ static int ext2_rename (struct inode * o - struct ext2_dir_entry_2 * old_de; - int err = -ENOENT; - -+ if (new_inode) -+ DQUOT_INIT(new_inode); -+ - old_de = ext2_find_entry (old_dir, old_dentry, &old_page); - if (!old_de) - goto out; -diff -upr linux-2.6.16.orig/fs/ext2/super.c linux-2.6.16-026test009/fs/ext2/super.c ---- linux-2.6.16.orig/fs/ext2/super.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ext2/super.c 2006-04-19 15:02:12.000000000 +0400 -@@ -996,7 +996,7 @@ static int ext2_remount (struct super_bl - es = sbi->s_es; - if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != - (old_mount_opt & EXT2_MOUNT_XIP)) && -- invalidate_inodes(sb)) -+ invalidate_inodes(sb, 0)) - ext2_warning(sb, __FUNCTION__, "busy inodes while remounting "\ - "xip remain in cache (no functional problem)"); - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) -@@ -1205,7 +1205,7 @@ static struct file_system_type ext2_fs_t - .name = "ext2", - .get_sb = ext2_get_sb, - .kill_sb = kill_block_super, -- .fs_flags = FS_REQUIRES_DEV, -+ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, - }; - - static int __init init_ext2_fs(void) -diff -upr linux-2.6.16.orig/fs/ext3/acl.c linux-2.6.16-026test009/fs/ext3/acl.c ---- linux-2.6.16.orig/fs/ext3/acl.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ext3/acl.c 2006-04-19 15:02:11.000000000 +0400 -@@ -299,9 +299,10 @@ 
ext3_check_acl(struct inode *inode, int - } - - int --ext3_permission(struct inode *inode, int mask, struct nameidata *nd) -+ext3_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { -- return generic_permission(inode, mask, ext3_check_acl); -+ return generic_permission(inode, mask, ext3_check_acl, perm); - } - - /* -diff -upr linux-2.6.16.orig/fs/ext3/acl.h linux-2.6.16-026test009/fs/ext3/acl.h ---- linux-2.6.16.orig/fs/ext3/acl.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ext3/acl.h 2006-04-19 15:02:11.000000000 +0400 -@@ -58,7 +58,8 @@ static inline int ext3_acl_count(size_t - #define EXT3_ACL_NOT_CACHED ((void *)-1) - - /* acl.c */ --extern int ext3_permission (struct inode *, int, struct nameidata *); -+extern int ext3_permission (struct inode *, int, struct nameidata *, -+ struct exec_perm *); - extern int ext3_acl_chmod (struct inode *); - extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); - -diff -upr linux-2.6.16.orig/fs/ext3/resize.c linux-2.6.16-026test009/fs/ext3/resize.c ---- linux-2.6.16.orig/fs/ext3/resize.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ext3/resize.c 2006-04-19 15:02:11.000000000 +0400 -@@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block - if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!"); -+ unlock_super(sb); - err = -EBUSY; - goto exit_put; - } -diff -upr linux-2.6.16.orig/fs/ext3/super.c linux-2.6.16-026test009/fs/ext3/super.c ---- linux-2.6.16.orig/fs/ext3/super.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ext3/super.c 2006-04-19 15:02:12.000000000 +0400 -@@ -2661,7 +2661,7 @@ static struct file_system_type ext3_fs_t - .name = "ext3", - .get_sb = ext3_get_sb, - .kill_sb = kill_block_super, -- .fs_flags = FS_REQUIRES_DEV, -+ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, - }; - - static int __init 
init_ext3_fs(void) -diff -upr linux-2.6.16.orig/fs/fcntl.c linux-2.6.16-026test009/fs/fcntl.c ---- linux-2.6.16.orig/fs/fcntl.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/fcntl.c 2006-04-19 15:02:12.000000000 +0400 -@@ -18,6 +18,7 @@ - #include <linux/ptrace.h> - #include <linux/signal.h> - #include <linux/rcupdate.h> -+#include <linux/ve_owner.h> - - #include <asm/poll.h> - #include <asm/siginfo.h> -@@ -190,6 +191,7 @@ out_fput: - fput(file); - goto out; - } -+EXPORT_SYMBOL_GPL(sys_dup2); - - asmlinkage long sys_dup(unsigned int fildes) - { -@@ -254,6 +256,7 @@ static int setfl(int fd, struct file * f - static void f_modown(struct file *filp, unsigned long pid, - uid_t uid, uid_t euid, int force) - { -+ pid = comb_vpid_to_pid(pid); - write_lock_irq(&filp->f_owner.lock); - if (force || !filp->f_owner.pid) { - filp->f_owner.pid = pid; -@@ -320,7 +323,7 @@ static long do_fcntl(int fd, unsigned in - * current syscall conventions, the only way - * to fix this will be in libc. 
- */ -- err = filp->f_owner.pid; -+ err = comb_pid_to_vpid(filp->f_owner.pid); - force_successful_syscall_return(); - break; - case F_SETOWN: -@@ -472,23 +475,29 @@ static void send_sigio_to_task(struct ta - void send_sigio(struct fown_struct *fown, int fd, int band) - { - struct task_struct *p; -+ struct file *f; -+ struct ve_struct *ve; - int pid; - - read_lock(&fown->lock); - pid = fown->pid; - if (!pid) - goto out_unlock_fown; -+ -+ /* hack: fown's are always embedded in struct file */ -+ f = container_of(fown, struct file, f_owner); -+ ve = VE_OWNER_FILP(f); - - read_lock(&tasklist_lock); - if (pid > 0) { -- p = find_task_by_pid(pid); -- if (p) { -+ p = find_task_by_pid_all(pid); -+ if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) { - send_sigio_to_task(p, fown, fd, band); - } - } else { -- do_each_task_pid(-pid, PIDTYPE_PGID, p) { -+ __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) { - send_sigio_to_task(p, fown, fd, band); -- } while_each_task_pid(-pid, PIDTYPE_PGID, p); -+ } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve); - } - read_unlock(&tasklist_lock); - out_unlock_fown: -@@ -505,6 +514,8 @@ static void send_sigurg_to_task(struct t - int send_sigurg(struct fown_struct *fown) - { - struct task_struct *p; -+ struct file *f; -+ struct ve_struct *ve; - int pid, ret = 0; - - read_lock(&fown->lock); -@@ -513,17 +524,19 @@ int send_sigurg(struct fown_struct *fown - goto out_unlock_fown; - - ret = 1; -+ f = container_of(fown, struct file, f_owner); -+ ve = VE_OWNER_FILP(f); - - read_lock(&tasklist_lock); - if (pid > 0) { -- p = find_task_by_pid(pid); -- if (p) { -+ p = find_task_by_pid_all(pid); -+ if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) { - send_sigurg_to_task(p, fown); - } - } else { -- do_each_task_pid(-pid, PIDTYPE_PGID, p) { -+ __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) { - send_sigurg_to_task(p, fown); -- } while_each_task_pid(-pid, PIDTYPE_PGID, p); -+ } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve); - } - 
read_unlock(&tasklist_lock); - out_unlock_fown: -diff -upr linux-2.6.16.orig/fs/file.c linux-2.6.16-026test009/fs/file.c ---- linux-2.6.16.orig/fs/file.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/file.c 2006-04-19 15:02:12.000000000 +0400 -@@ -8,6 +8,7 @@ - - #include <linux/fs.h> - #include <linux/mm.h> -+#include <linux/module.h> - #include <linux/time.h> - #include <linux/slab.h> - #include <linux/vmalloc.h> -@@ -18,6 +19,8 @@ - #include <linux/rcupdate.h> - #include <linux/workqueue.h> - -+#include <ub/ub_mem.h> -+ - struct fdtable_defer { - spinlock_t lock; - struct work_struct wq; -@@ -44,9 +47,9 @@ struct file ** alloc_fd_array(int num) - int size = num * sizeof(struct file *); - - if (size <= PAGE_SIZE) -- new_fds = (struct file **) kmalloc(size, GFP_KERNEL); -+ new_fds = (struct file **) ub_kmalloc(size, GFP_KERNEL); - else -- new_fds = (struct file **) vmalloc(size); -+ new_fds = (struct file **) ub_vmalloc(size); - return new_fds; - } - -@@ -212,9 +215,9 @@ fd_set * alloc_fdset(int num) - int size = num / 8; - - if (size <= PAGE_SIZE) -- new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL); -+ new_fdset = (fd_set *) ub_kmalloc(size, GFP_KERNEL); - else -- new_fdset = (fd_set *) vmalloc(size); -+ new_fdset = (fd_set *) ub_vmalloc(size); - return new_fdset; - } - -@@ -302,7 +305,7 @@ out: - * both fd array and fdset. It is expected to be called with the - * files_lock held. - */ --static int expand_fdtable(struct files_struct *files, int nr) -+int expand_fdtable(struct files_struct *files, int nr) - __releases(files->file_lock) - __acquires(files->file_lock) - { -@@ -338,6 +341,7 @@ static int expand_fdtable(struct files_s - out: - return error; - } -+EXPORT_SYMBOL_GPL(expand_fdtable); - - /* - * Expand files. 
-diff -upr linux-2.6.16.orig/fs/file_table.c linux-2.6.16-026test009/fs/file_table.c ---- linux-2.6.16.orig/fs/file_table.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/file_table.c 2006-04-19 15:02:12.000000000 +0400 -@@ -9,6 +9,7 @@ - #include <linux/string.h> - #include <linux/slab.h> - #include <linux/file.h> -+#include <linux/ve_owner.h> - #include <linux/init.h> - #include <linux/module.h> - #include <linux/smp_lock.h> -@@ -25,6 +26,8 @@ - - #include <asm/atomic.h> - -+#include <ub/ub_misc.h> -+ - /* sysctl tunables... */ - struct files_stat_struct files_stat = { - .max_files = NR_FILE -@@ -38,6 +41,8 @@ static struct percpu_counter nr_files __ - static inline void file_free_rcu(struct rcu_head *head) - { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); -+ ub_file_uncharge(f); -+ put_ve(VE_OWNER_FILP(f)); - kmem_cache_free(filp_cachep, f); - } - -@@ -109,6 +114,12 @@ struct file *get_empty_filp(void) - - percpu_counter_inc(&nr_files); - memset(f, 0, sizeof(*f)); -+ -+ if (ub_file_charge(f)) -+ goto fail_ch; -+ -+ SET_VE_OWNER_FILP(f, get_ve(get_exec_env())); -+ - if (security_file_alloc(f)) - goto fail_sec; - -@@ -134,6 +145,10 @@ fail_sec: - file_free(f); - fail: - return NULL; -+ -+fail_ch: -+ kmem_cache_free(filp_cachep, f); -+ return NULL; - } - - EXPORT_SYMBOL(get_empty_filp); -diff -upr linux-2.6.16.orig/fs/filesystems.c linux-2.6.16-026test009/fs/filesystems.c ---- linux-2.6.16.orig/fs/filesystems.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/filesystems.c 2006-04-19 15:02:12.000000000 +0400 -@@ -13,6 +13,7 @@ - #include <linux/init.h> - #include <linux/module.h> - #include <linux/sched.h> /* for 'current' */ -+#include <linux/ve_owner.h> - #include <asm/uaccess.h> - - /* -@@ -22,8 +23,8 @@ - * During the unload module must call unregister_filesystem(). - * We can access the fields of list element if: - * 1) spinlock is held or -- * 2) we hold the reference to the module. 
-- * The latter can be guaranteed by call of try_module_get(); if it -+ * 2) we hold the reference to the element. -+ * The latter can be guaranteed by call of try_filesystem(); if it - * returned 0 we must skip the element, otherwise we got the reference. - * Once the reference is obtained we can drop the spinlock. - */ -@@ -31,23 +32,51 @@ - static struct file_system_type *file_systems; - static DEFINE_RWLOCK(file_systems_lock); - -+int try_get_filesystem(struct file_system_type *fs) -+{ -+ if (try_module_get(fs->owner)) { -+#ifdef CONFIG_VE -+ get_ve(VE_OWNER_FSTYPE(fs)); -+#endif -+ return 1; -+ } -+ return 0; -+} -+ - /* WARNING: This can be used only if we _already_ own a reference */ - void get_filesystem(struct file_system_type *fs) - { -+#ifdef CONFIG_VE -+ get_ve(VE_OWNER_FSTYPE(fs)); -+#endif - __module_get(fs->owner); - } - - void put_filesystem(struct file_system_type *fs) - { - module_put(fs->owner); -+#ifdef CONFIG_VE -+ put_ve(VE_OWNER_FSTYPE(fs)); -+#endif -+} -+ -+static inline int check_ve_fstype(struct file_system_type *p, -+ struct ve_struct *env) -+{ -+ return ((p->fs_flags & FS_VIRTUALIZED) || -+ ve_accessible_strict(VE_OWNER_FSTYPE(p), env)); - } - --static struct file_system_type **find_filesystem(const char *name) -+static struct file_system_type **find_filesystem(const char *name, -+ struct ve_struct *env) - { - struct file_system_type **p; -- for (p=&file_systems; *p; p=&(*p)->next) -+ for (p=&file_systems; *p; p=&(*p)->next) { -+ if (!check_ve_fstype(*p, env)) -+ continue; - if (strcmp((*p)->name,name) == 0) - break; -+ } - return p; - } - -@@ -74,8 +103,10 @@ int register_filesystem(struct file_syst - if (fs->next) - return -EBUSY; - INIT_LIST_HEAD(&fs->fs_supers); -+ if (VE_OWNER_FSTYPE(fs) == NULL) -+ SET_VE_OWNER_FSTYPE(fs, get_ve0()); - write_lock(&file_systems_lock); -- p = find_filesystem(fs->name); -+ p = find_filesystem(fs->name, VE_OWNER_FSTYPE(fs)); - if (*p) - res = -EBUSY; - else -@@ -132,11 +163,14 @@ static int 
fs_index(const char __user * - - err = -EINVAL; - read_lock(&file_systems_lock); -- for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { -+ for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) { -+ if (!check_ve_fstype(tmp, get_exec_env())) -+ continue; - if (strcmp(tmp->name,name) == 0) { - err = index; - break; - } -+ index++; - } - read_unlock(&file_systems_lock); - putname(name); -@@ -149,9 +183,15 @@ static int fs_name(unsigned int index, c - int len, res; - - read_lock(&file_systems_lock); -- for (tmp = file_systems; tmp; tmp = tmp->next, index--) -- if (index <= 0 && try_module_get(tmp->owner)) -- break; -+ for (tmp = file_systems; tmp; tmp = tmp->next) { -+ if (!check_ve_fstype(tmp, get_exec_env())) -+ continue; -+ if (!index) { -+ if (try_get_filesystem(tmp)) -+ break; -+ } else -+ index--; -+ } - read_unlock(&file_systems_lock); - if (!tmp) - return -EINVAL; -@@ -169,8 +209,9 @@ static int fs_maxindex(void) - int index; - - read_lock(&file_systems_lock); -- for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) -- ; -+ for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next) -+ if (check_ve_fstype(tmp, get_exec_env())) -+ index++; - read_unlock(&file_systems_lock); - return index; - } -@@ -206,9 +247,10 @@ int get_filesystem_list(char * buf) - read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp && len < PAGE_SIZE - 80) { -- len += sprintf(buf+len, "%s\t%s\n", -- (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", -- tmp->name); -+ if (check_ve_fstype(tmp, get_exec_env())) -+ len += sprintf(buf+len, "%s\t%s\n", -+ (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", -+ tmp->name); - tmp = tmp->next; - } - read_unlock(&file_systems_lock); -@@ -220,14 +262,14 @@ struct file_system_type *get_fs_type(con - struct file_system_type *fs; - - read_lock(&file_systems_lock); -- fs = *(find_filesystem(name)); -- if (fs && !try_module_get(fs->owner)) -+ fs = *(find_filesystem(name, get_exec_env())); -+ if (fs && !try_get_filesystem(fs)) - fs = NULL; - read_unlock(&file_systems_lock); - if (!fs && (request_module("%s", name) == 0)) { - read_lock(&file_systems_lock); -- fs = *(find_filesystem(name)); -- if (fs && !try_module_get(fs->owner)) -+ fs = *(find_filesystem(name, get_exec_env())); -+ if (fs && !try_get_filesystem(fs)) - fs = NULL; - read_unlock(&file_systems_lock); - } -@@ -235,3 +277,5 @@ struct file_system_type *get_fs_type(con - } - - EXPORT_SYMBOL(get_fs_type); -+EXPORT_SYMBOL(get_filesystem); -+EXPORT_SYMBOL(put_filesystem); -diff -upr linux-2.6.16.orig/fs/fuse/dir.c linux-2.6.16-026test009/fs/fuse/dir.c ---- linux-2.6.16.orig/fs/fuse/dir.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/fuse/dir.c 2006-04-19 15:02:11.000000000 +0400 -@@ -708,14 +708,15 @@ static int fuse_access(struct inode *ino - * access request is sent. Execute permission is still checked - * locally based on file mode. - */ --static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) -+static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - struct fuse_conn *fc = get_fuse_conn(inode); - - if (!fuse_allow_task(fc, current)) - return -EACCES; - else if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { -- int err = generic_permission(inode, mask, NULL); -+ int err = generic_permission(inode, mask, NULL, perm); - - /* If permission is denied, try to refresh file - attributes. 
This is also needed, because the root -@@ -723,7 +724,7 @@ static int fuse_permission(struct inode - if (err == -EACCES) { - err = fuse_do_getattr(inode); - if (!err) -- err = generic_permission(inode, mask, NULL); -+ err = generic_permission(inode, mask, NULL, perm); - } - - /* Note: the opposite of the above test does not -diff -upr linux-2.6.16.orig/fs/fuse/file.c linux-2.6.16-026test009/fs/fuse/file.c ---- linux-2.6.16.orig/fs/fuse/file.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/fuse/file.c 2006-04-19 15:02:11.000000000 +0400 -@@ -397,8 +397,12 @@ static int fuse_readpages(struct file *f - return -EINTR; - - err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); -- if (!err) -- fuse_send_readpages(data.req, file, inode); -+ if (!err) { -+ if (data.req->num_pages) -+ fuse_send_readpages(data.req, file, inode); -+ else -+ fuse_put_request(fc, data.req); -+ } - return err; - } - -diff -upr linux-2.6.16.orig/fs/hfs/inode.c linux-2.6.16-026test009/fs/hfs/inode.c ---- linux-2.6.16.orig/fs/hfs/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/hfs/inode.c 2006-04-19 15:02:11.000000000 +0400 -@@ -520,11 +520,11 @@ void hfs_clear_inode(struct inode *inode - } - - static int hfs_permission(struct inode *inode, int mask, -- struct nameidata *nd) -+ struct nameidata *nd, struct exec_perm *perm) - { - if (S_ISREG(inode->i_mode) && mask & MAY_EXEC) - return 0; -- return generic_permission(inode, mask, NULL); -+ return generic_permission(inode, mask, NULL, perm); - } - - static int hfs_file_open(struct inode *inode, struct file *file) -diff -upr linux-2.6.16.orig/fs/hfsplus/inode.c linux-2.6.16-026test009/fs/hfsplus/inode.c ---- linux-2.6.16.orig/fs/hfsplus/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/hfsplus/inode.c 2006-04-19 15:02:11.000000000 +0400 -@@ -237,7 +237,8 @@ static void hfsplus_set_perms(struct ino - perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); - } - --static int 
hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd) -+static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - /* MAY_EXEC is also used for lookup, if no x bit is set allow lookup, - * open_exec has the same test, so it's still not executable, if a x bit -@@ -245,7 +246,7 @@ static int hfsplus_permission(struct ino - */ - if (S_ISREG(inode->i_mode) && mask & MAY_EXEC && !(inode->i_mode & 0111)) - return 0; -- return generic_permission(inode, mask, NULL); -+ return generic_permission(inode, mask, NULL, perm); - } - - -diff -upr linux-2.6.16.orig/fs/hostfs/hostfs_kern.c linux-2.6.16-026test009/fs/hostfs/hostfs_kern.c ---- linux-2.6.16.orig/fs/hostfs/hostfs_kern.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/hostfs/hostfs_kern.c 2006-04-19 15:02:11.000000000 +0400 -@@ -796,7 +796,8 @@ int hostfs_rename(struct inode *from_ino - return(err); - } - --int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd) -+int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd, -+ struct exec_perm *perm) - { - char *name; - int r = 0, w = 0, x = 0, err; -@@ -814,7 +815,7 @@ int hostfs_permission(struct inode *ino, - err = access_file(name, r, w, x); - kfree(name); - if(!err) -- err = generic_permission(ino, desired, NULL); -+ err = generic_permission(ino, desired, NULL, perm); - return err; - } - -diff -upr linux-2.6.16.orig/fs/hpfs/namei.c linux-2.6.16-026test009/fs/hpfs/namei.c ---- linux-2.6.16.orig/fs/hpfs/namei.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/hpfs/namei.c 2006-04-19 15:02:11.000000000 +0400 -@@ -415,7 +415,7 @@ again: - d_drop(dentry); - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) > 1 || -- permission(inode, MAY_WRITE, NULL) || -+ permission(inode, MAY_WRITE, NULL, NULL) || - !S_ISREG(inode->i_mode) || - get_write_access(inode)) { - spin_unlock(&dentry->d_lock); -diff -upr 
linux-2.6.16.orig/fs/hugetlbfs/inode.c linux-2.6.16-026test009/fs/hugetlbfs/inode.c ---- linux-2.6.16.orig/fs/hugetlbfs/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/hugetlbfs/inode.c 2006-04-19 15:02:12.000000000 +0400 -@@ -800,7 +800,7 @@ struct file *hugetlb_zero_setup(size_t s - struct inode *inode; - struct dentry *dentry, *root; - struct qstr quick_string; -- char buf[16]; -+ char buf[64]; - - if (!can_do_hugetlb_shm()) - return ERR_PTR(-EPERM); -@@ -812,7 +812,8 @@ struct file *hugetlb_zero_setup(size_t s - return ERR_PTR(-ENOMEM); - - root = hugetlbfs_vfsmount->mnt_root; -- snprintf(buf, 16, "%lu", hugetlbfs_counter()); -+ snprintf(buf, sizeof(buf), "VE%d-%lu", -+ get_exec_env()->veid, hugetlbfs_counter()); - quick_string.name = buf; - quick_string.len = strlen(quick_string.name); - quick_string.hash = 0; -diff -upr linux-2.6.16.orig/fs/inode.c linux-2.6.16-026test009/fs/inode.c ---- linux-2.6.16.orig/fs/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/inode.c 2006-04-19 15:02:12.000000000 +0400 -@@ -9,6 +9,7 @@ - #include <linux/mm.h> - #include <linux/dcache.h> - #include <linux/init.h> -+#include <linux/kernel_stat.h> - #include <linux/quotaops.h> - #include <linux/slab.h> - #include <linux/writeback.h> -@@ -98,13 +99,15 @@ DECLARE_MUTEX(iprune_sem); - */ - struct inodes_stat_t inodes_stat; - --static kmem_cache_t * inode_cachep; -+kmem_cache_t *inode_cachep; -+ -+static struct address_space_operations vfs_empty_aops; -+struct inode_operations vfs_empty_iops; -+static struct file_operations vfs_empty_fops; -+EXPORT_SYMBOL(vfs_empty_iops); - - static struct inode *alloc_inode(struct super_block *sb) - { -- static struct address_space_operations empty_aops; -- static struct inode_operations empty_iops; -- static struct file_operations empty_fops; - struct inode *inode; - - if (sb->s_op->alloc_inode) -@@ -119,8 +122,8 @@ static struct inode *alloc_inode(struct - inode->i_blkbits = 
sb->s_blocksize_bits; - inode->i_flags = 0; - atomic_set(&inode->i_count, 1); -- inode->i_op = &empty_iops; -- inode->i_fop = &empty_fops; -+ inode->i_op = &vfs_empty_iops; -+ inode->i_fop = &vfs_empty_fops; - inode->i_nlink = 1; - atomic_set(&inode->i_writecount, 0); - inode->i_size = 0; -@@ -144,7 +147,7 @@ static struct inode *alloc_inode(struct - return NULL; - } - -- mapping->a_ops = &empty_aops; -+ mapping->a_ops = &vfs_empty_aops; - mapping->host = inode; - mapping->flags = 0; - mapping_set_gfp_mask(mapping, GFP_HIGHUSER); -@@ -303,13 +306,57 @@ static void dispose_list(struct list_hea - spin_unlock(&inode_lock); - } - -+static void show_header(struct inode *inode) -+{ -+ struct super_block *sb = inode->i_sb; -+ -+ printk("VFS: Busy inodes after unmount. " -+ "sb = %p, fs type = %s, sb count = %d, " -+ "sb->s_root = %s\n", sb, -+ (sb->s_type != NULL) ? sb->s_type->name : "", -+ sb->s_count, -+ (sb->s_root != NULL) ? -+ (char *)sb->s_root->d_name.name : ""); -+} -+ -+static void show_inode(struct list_head *tmp, struct inode *inode) -+{ -+ struct dentry *d; -+ int i; -+ -+ printk("inode = %p, inode->i_count = %d, " -+ "inode->i_nlink = %d, " -+ "inode->i_mode = %d, " -+ "inode->i_state = %ld, " -+ "inode->i_flags = %d, " -+ "inode->i_devices.next = %p, " -+ "inode->i_devices.prev = %p, " -+ "inode->i_ino = %ld\n", -+ tmp, -+ atomic_read(&inode->i_count), -+ inode->i_nlink, -+ inode->i_mode, -+ inode->i_state, -+ inode->i_flags, -+ inode->i_devices.next, -+ inode->i_devices.prev, -+ inode->i_ino); -+ printk("inode dump: "); -+ for (i = 0; i < sizeof(*tmp); i++) -+ printk("%2.2x ", *((u_char *)tmp + i)); -+ printk("\n"); -+ list_for_each_entry(d, &inode->i_dentry, d_alias) -+ printk(" d_alias %s\n", -+ d->d_name.name); -+} -+ - /* - * Invalidate all inodes for a device. 
- */ --static int invalidate_list(struct list_head *head, struct list_head *dispose) -+static int invalidate_list(struct list_head *head, struct list_head *dispose, int check) - { - struct list_head *next; -- int busy = 0, count = 0; -+ int busy = 0, count = 0, once = 0; - - next = head->next; - for (;;) { -@@ -336,6 +383,14 @@ static int invalidate_list(struct list_h - continue; - } - busy = 1; -+ -+ if (check) { -+ if (once) { -+ once = 0; -+ show_header(inode); -+ } -+ show_inode(tmp, inode); -+ } - } - /* only unused inodes may be cached with i_count zero */ - inodes_stat.nr_unused -= count; -@@ -350,7 +405,7 @@ static int invalidate_list(struct list_h - * fails because there are busy inodes then a non zero value is returned. - * If the discard is successful all the inodes have been discarded. - */ --int invalidate_inodes(struct super_block * sb) -+int invalidate_inodes(struct super_block * sb, int check) - { - int busy; - LIST_HEAD(throw_away); -@@ -358,7 +413,7 @@ int invalidate_inodes(struct super_block - down(&iprune_sem); - spin_lock(&inode_lock); - inotify_unmount_inodes(&sb->s_inodes); -- busy = invalidate_list(&sb->s_inodes, &throw_away); -+ busy = invalidate_list(&sb->s_inodes, &throw_away, check); - spin_unlock(&inode_lock); - - dispose_list(&throw_away); -@@ -382,7 +437,7 @@ int __invalidate_device(struct block_dev - * hold). - */ - shrink_dcache_sb(sb); -- res = invalidate_inodes(sb); -+ res = invalidate_inodes(sb, 0); - drop_super(sb); - } - invalidate_bdev(bdev, 0); -@@ -478,6 +533,7 @@ static void prune_icache(int nr_to_scan) - */ - static int shrink_icache_memory(int nr, gfp_t gfp_mask) - { -+ KSTAT_PERF_ENTER(shrink_icache) - if (nr) { - /* - * Nasty deadlock avoidance. 
We may hold various FS locks, -@@ -488,6 +544,7 @@ static int shrink_icache_memory(int nr, - return -1; - prune_icache(nr); - } -+ KSTAT_PERF_LEAVE(shrink_icache) - return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; - } - -@@ -737,7 +794,7 @@ EXPORT_SYMBOL(iunique); - struct inode *igrab(struct inode *inode) - { - spin_lock(&inode_lock); -- if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) -+ if (inode && !(inode->i_state & (I_FREEING|I_WILL_FREE))) - __iget(inode); - else - /* -diff -upr linux-2.6.16.orig/fs/inotify.c linux-2.6.16-026test009/fs/inotify.c ---- linux-2.6.16.orig/fs/inotify.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/inotify.c 2006-04-19 15:02:11.000000000 +0400 -@@ -374,7 +374,7 @@ static int find_inode(const char __user - if (error) - return error; - /* you can only watch an inode if you have read permissions on it */ -- error = vfs_permission(nd, MAY_READ); -+ error = vfs_permission(nd, MAY_READ, NULL); - if (error) - path_release(nd); - return error; -diff -upr linux-2.6.16.orig/fs/ioprio.c linux-2.6.16-026test009/fs/ioprio.c ---- linux-2.6.16.orig/fs/ioprio.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ioprio.c 2006-04-19 15:02:12.000000000 +0400 -@@ -53,6 +53,9 @@ asmlinkage long sys_ioprio_set(int which - struct user_struct *user; - int ret; - -+ if (!ve_is_super(get_exec_env())) -+ return -EPERM; -+ - switch (class) { - case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) -@@ -78,18 +81,18 @@ asmlinkage long sys_ioprio_set(int which - if (!who) - p = current; - else -- p = find_task_by_pid(who); -+ p = find_task_by_pid_all(who); - if (p) - ret = set_task_ioprio(p, ioprio); - break; - case IOPRIO_WHO_PGRP: - if (!who) - who = process_group(current); -- do_each_task_pid(who, PIDTYPE_PGID, p) { -+ do_each_task_pid_all(who, PIDTYPE_PGID, p) { - ret = set_task_ioprio(p, ioprio); - if (ret) - break; -- } while_each_task_pid(who, PIDTYPE_PGID, p); -+ } while_each_task_pid_all(who, 
PIDTYPE_PGID, p); - break; - case IOPRIO_WHO_USER: - if (!who) -@@ -100,13 +103,13 @@ asmlinkage long sys_ioprio_set(int which - if (!user) - break; - -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - if (p->uid != who) - continue; - ret = set_task_ioprio(p, ioprio); - if (ret) - break; -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - - if (who) - free_uid(user); -@@ -131,19 +134,19 @@ asmlinkage long sys_ioprio_get(int which - if (!who) - p = current; - else -- p = find_task_by_pid(who); -+ p = find_task_by_pid_ve(who); - if (p) - ret = p->ioprio; - break; - case IOPRIO_WHO_PGRP: - if (!who) - who = process_group(current); -- do_each_task_pid(who, PIDTYPE_PGID, p) { -+ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { - if (ret == -ESRCH) - ret = p->ioprio; - else - ret = ioprio_best(ret, p->ioprio); -- } while_each_task_pid(who, PIDTYPE_PGID, p); -+ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); - break; - case IOPRIO_WHO_USER: - if (!who) -@@ -154,14 +157,14 @@ asmlinkage long sys_ioprio_get(int which - if (!user) - break; - -- do_each_thread(g, p) { -+ do_each_thread_ve(g, p) { - if (p->uid != user->uid) - continue; - if (ret == -ESRCH) - ret = p->ioprio; - else - ret = ioprio_best(ret, p->ioprio); -- } while_each_thread(g, p); -+ } while_each_thread_ve(g, p); - - if (who) - free_uid(user); -diff -upr linux-2.6.16.orig/fs/jfs/acl.c linux-2.6.16-026test009/fs/jfs/acl.c ---- linux-2.6.16.orig/fs/jfs/acl.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/jfs/acl.c 2006-04-19 15:02:11.000000000 +0400 -@@ -140,9 +140,10 @@ static int jfs_check_acl(struct inode *i - return -EAGAIN; - } - --int jfs_permission(struct inode *inode, int mask, struct nameidata *nd) -+int jfs_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { -- return generic_permission(inode, mask, jfs_check_acl); -+ return generic_permission(inode, mask, jfs_check_acl, perm); - } - - int jfs_init_acl(tid_t tid, struct 
inode *inode, struct inode *dir) -diff -upr linux-2.6.16.orig/fs/jfs/jfs_acl.h linux-2.6.16-026test009/fs/jfs/jfs_acl.h ---- linux-2.6.16.orig/fs/jfs/jfs_acl.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/jfs/jfs_acl.h 2006-04-19 15:02:11.000000000 +0400 -@@ -20,7 +20,7 @@ - - #ifdef CONFIG_JFS_POSIX_ACL - --int jfs_permission(struct inode *, int, struct nameidata *); -+int jfs_permission(struct inode *, int, struct nameidata *, struct exec_perm *); - int jfs_init_acl(tid_t, struct inode *, struct inode *); - int jfs_setattr(struct dentry *, struct iattr *); - -diff -upr linux-2.6.16.orig/fs/lockd/clntproc.c linux-2.6.16-026test009/fs/lockd/clntproc.c ---- linux-2.6.16.orig/fs/lockd/clntproc.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/lockd/clntproc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -130,10 +130,10 @@ static void nlmclnt_setlockargs(struct n - nlmclnt_next_cookie(&argp->cookie); - argp->state = nsm_local_state; - memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh)); -- lock->caller = system_utsname.nodename; -+ lock->caller = ve_utsname.nodename; - lock->oh.data = req->a_owner; - lock->oh.len = sprintf(req->a_owner, "%d@%s", -- current->pid, system_utsname.nodename); -+ current->pid, ve_utsname.nodename); - locks_copy_lock(&lock->fl, fl); - } - -@@ -154,7 +154,7 @@ nlmclnt_setgrantargs(struct nlm_rqst *ca - { - locks_copy_lock(&call->a_args.lock.fl, &lock->fl); - memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); -- call->a_args.lock.caller = system_utsname.nodename; -+ call->a_args.lock.caller = ve_utsname.nodename; - call->a_args.lock.oh.len = lock->oh.len; - - /* set default data area */ -diff -upr linux-2.6.16.orig/fs/lockd/mon.c linux-2.6.16-026test009/fs/lockd/mon.c ---- linux-2.6.16.orig/fs/lockd/mon.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/lockd/mon.c 2006-04-19 15:02:12.000000000 +0400 -@@ -147,7 +147,7 @@ 
xdr_encode_common(struct rpc_rqst *rqstp - */ - sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); - if (!(p = xdr_encode_string(p, buffer)) -- || !(p = xdr_encode_string(p, system_utsname.nodename))) -+ || !(p = xdr_encode_string(p, ve_utsname.nodename))) - return ERR_PTR(-EIO); - *p++ = htonl(argp->prog); - *p++ = htonl(argp->vers); -diff -upr linux-2.6.16.orig/fs/locks.c linux-2.6.16-026test009/fs/locks.c ---- linux-2.6.16.orig/fs/locks.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/locks.c 2006-04-19 15:02:12.000000000 +0400 -@@ -129,6 +129,8 @@ - #include <asm/semaphore.h> - #include <asm/uaccess.h> - -+#include <ub/ub_misc.h> -+ - #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) - #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) - #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) -@@ -148,11 +150,28 @@ static LIST_HEAD(blocked_list); - static kmem_cache_t *filelock_cache; - - /* Allocate an empty lock structure. */ --static struct file_lock *locks_alloc_lock(void) -+static struct file_lock *locks_alloc_lock(int charge) - { -- return kmem_cache_alloc(filelock_cache, SLAB_KERNEL); -+ struct file_lock *fl; -+ -+ fl = kmem_cache_alloc(filelock_cache, SLAB_KERNEL); -+#ifdef CONFIG_USER_RESOURCE -+ if (fl == NULL) -+ goto out; -+ fl->fl_charged = 0; -+ if (!charge) -+ goto out; -+ if (!ub_flock_charge(fl, 1)) -+ goto out; -+ -+ kmem_cache_free(filelock_cache, fl); -+ fl = NULL; -+out: -+#endif -+ return fl; - } - -+ - /* Free a lock which is not in use. 
*/ - static void locks_free_lock(struct file_lock *fl) - { -@@ -181,6 +200,7 @@ static void locks_free_lock(struct file_ - fl->fl_lmops = NULL; - } - -+ ub_flock_uncharge(fl); - kmem_cache_free(filelock_cache, fl); - } - -@@ -263,7 +283,7 @@ static int flock_make_lock(struct file * - if (type < 0) - return type; - -- fl = locks_alloc_lock(); -+ fl = locks_alloc_lock(type != F_UNLCK); - if (fl == NULL) - return -ENOMEM; - -@@ -451,7 +471,7 @@ static int lease_init(struct file *filp, - /* Allocate a file_lock initialised to this type of lease */ - static int lease_alloc(struct file *filp, int type, struct file_lock **flp) - { -- struct file_lock *fl = locks_alloc_lock(); -+ struct file_lock *fl = locks_alloc_lock(1); - int error; - - if (fl == NULL) -@@ -784,8 +804,11 @@ static int __posix_lock_file(struct inod - * We may need two file_lock structures for this operation, - * so we get them in advance to avoid races. - */ -- new_fl = locks_alloc_lock(); -- new_fl2 = locks_alloc_lock(); -+ if (request->fl_type != F_UNLCK) -+ new_fl = locks_alloc_lock(1); -+ else -+ new_fl = NULL; -+ new_fl2 = locks_alloc_lock(0); - - lock_kernel(); - if (request->fl_type != F_UNLCK) { -@@ -813,7 +836,7 @@ static int __posix_lock_file(struct inod - goto out; - - error = -ENOLCK; /* "no luck" */ -- if (!(new_fl && new_fl2)) -+ if (!((request->fl_type == F_UNLCK || new_fl) && new_fl2)) - goto out; - - /* -@@ -919,19 +942,30 @@ static int __posix_lock_file(struct inod - if (!added) { - if (request->fl_type == F_UNLCK) - goto out; -+ error = -ENOLCK; -+ if (right && (left == right) && ub_flock_charge(new_fl, 1)) -+ goto out; - locks_copy_lock(new_fl, request); - locks_insert_lock(before, new_fl); - new_fl = NULL; -+ error = 0; - } - if (right) { - if (left == right) { - /* The new lock breaks the old one in two pieces, - * so we have to use the second new lock. 
- */ -+ error = -ENOLCK; -+ if (added && ub_flock_charge(new_fl2, -+ request->fl_type != F_UNLCK)) -+ goto out; -+ /* FIXME move all fl_charged manipulations in ub code */ -+ set_flock_charged(new_fl2); - left = new_fl2; - new_fl2 = NULL; - locks_copy_lock(left, right); - locks_insert_lock(before, left); -+ error = 0; - } - right->fl_start = request->fl_end + 1; - locks_wake_up_blocks(right); -@@ -1538,6 +1572,7 @@ asmlinkage long sys_flock(unsigned int f - out: - return error; - } -+EXPORT_SYMBOL_GPL(sys_flock); - - /* Report the first existing lock that would conflict with l. - * This implements the F_GETLK command of fcntl(). -@@ -1573,7 +1608,7 @@ int fcntl_getlk(struct file *filp, struc - - flock.l_type = F_UNLCK; - if (fl != NULL) { -- flock.l_pid = fl->fl_pid; -+ flock.l_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); - #if BITS_PER_LONG == 32 - /* - * Make sure we can represent the posix lock via -@@ -1605,7 +1640,7 @@ out: - int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock __user *l) - { -- struct file_lock *file_lock = locks_alloc_lock(); -+ struct file_lock *file_lock = locks_alloc_lock(0); - struct flock flock; - struct inode *inode; - int error; -@@ -1727,7 +1762,7 @@ int fcntl_getlk64(struct file *filp, str - - flock.l_type = F_UNLCK; - if (fl != NULL) { -- flock.l_pid = fl->fl_pid; -+ flock.l_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); - flock.l_start = fl->fl_start; - flock.l_len = fl->fl_end == OFFSET_MAX ? 
0 : - fl->fl_end - fl->fl_start + 1; -@@ -1748,7 +1783,7 @@ out: - int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock64 __user *l) - { -- struct file_lock *file_lock = locks_alloc_lock(); -+ struct file_lock *file_lock = locks_alloc_lock(0); - struct flock64 flock; - struct inode *inode; - int error; -@@ -1976,7 +2011,9 @@ EXPORT_SYMBOL(posix_unblock_lock); - static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx) - { - struct inode *inode = NULL; -+ unsigned int fl_pid; - -+ fl_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); - if (fl->fl_file != NULL) - inode = fl->fl_file->f_dentry->d_inode; - -@@ -2018,16 +2055,16 @@ static void lock_get_status(char* out, s - } - if (inode) { - #ifdef WE_CAN_BREAK_LSLK_NOW -- out += sprintf(out, "%d %s:%ld ", fl->fl_pid, -+ out += sprintf(out, "%d %s:%ld ", fl_pid, - inode->i_sb->s_id, inode->i_ino); - #else - /* userspace relies on this representation of dev_t ;-( */ -- out += sprintf(out, "%d %02x:%02x:%ld ", fl->fl_pid, -+ out += sprintf(out, "%d %02x:%02x:%ld ", fl_pid, - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), inode->i_ino); - #endif - } else { -- out += sprintf(out, "%d <none>:0 ", fl->fl_pid); -+ out += sprintf(out, "%d <none>:0 ", fl_pid); - } - if (IS_POSIX(fl)) { - if (fl->fl_end == OFFSET_MAX) -@@ -2076,11 +2113,17 @@ int get_locks_status(char *buffer, char - char *q = buffer; - off_t pos = 0; - int i = 0; -+ struct ve_struct *env; - - lock_kernel(); -+ env = get_exec_env(); - list_for_each(tmp, &file_lock_list) { - struct list_head *btmp; - struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link); -+ -+ if (!ve_accessible(VE_OWNER_FILP(fl->fl_file), env)) -+ continue; -+ - lock_get_status(q, fl, ++i, ""); - move_lock_status(&q, &pos, offset); - -@@ -2238,7 +2281,7 @@ EXPORT_SYMBOL(steal_locks); - static int __init filelock_init(void) - { - filelock_cache = kmem_cache_create("file_lock_cache", -- sizeof(struct file_lock), 0, 
SLAB_PANIC, -+ sizeof(struct file_lock), 0, SLAB_PANIC | SLAB_UBC, - init_once, NULL); - return 0; - } -diff -upr linux-2.6.16.orig/fs/namei.c linux-2.6.16-026test009/fs/namei.c ---- linux-2.6.16.orig/fs/namei.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/namei.c 2006-04-19 15:02:12.000000000 +0400 -@@ -179,7 +179,7 @@ EXPORT_SYMBOL(putname); - * for filesystem access without changing the "normal" uids which - * are used for other things.. - */ --int generic_permission(struct inode *inode, int mask, -+static int __generic_permission(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) - { - umode_t mode = inode->i_mode; -@@ -225,7 +225,26 @@ int generic_permission(struct inode *ino - return -EACCES; - } - --int permission(struct inode *inode, int mask, struct nameidata *nd) -+int generic_permission(struct inode *inode, int mask, -+ int (*check_acl)(struct inode *inode, int mask), -+ struct exec_perm *perm) -+{ -+ int ret; -+ -+ if (perm == NULL) -+ return __generic_permission(inode, mask, check_acl); -+ -+ mutex_lock(&inode->i_mutex); -+ ret = __generic_permission(inode, mask, check_acl); -+ if (!ret) -+ set_exec_perm(perm, inode); -+ mutex_unlock(&inode->i_mutex); -+ return ret; -+} -+ -+ -+int permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - int retval, submask; - -@@ -250,9 +269,9 @@ int permission(struct inode *inode, int - /* Ordinary permission routines do not understand MAY_APPEND. 
*/ - submask = mask & ~MAY_APPEND; - if (inode->i_op && inode->i_op->permission) -- retval = inode->i_op->permission(inode, submask, nd); -+ retval = inode->i_op->permission(inode, submask, nd, perm); - else -- retval = generic_permission(inode, submask, NULL); -+ retval = generic_permission(inode, submask, NULL, perm); - if (retval) - return retval; - -@@ -269,9 +288,9 @@ int permission(struct inode *inode, int - * for filesystem access without changing the "normal" uids which - * are used for other things. - */ --int vfs_permission(struct nameidata *nd, int mask) -+int vfs_permission(struct nameidata *nd, int mask, struct exec_perm *perm) - { -- return permission(nd->dentry->d_inode, mask, nd); -+ return permission(nd->dentry->d_inode, mask, nd, perm); - } - - /** -@@ -288,7 +307,7 @@ int vfs_permission(struct nameidata *nd, - */ - int file_permission(struct file *file, int mask) - { -- return permission(file->f_dentry->d_inode, mask, NULL); -+ return permission(file->f_dentry->d_inode, mask, NULL, NULL); - } - - /* -@@ -704,7 +723,14 @@ static __always_inline void follow_dotdo - read_unlock(¤t->fs->lock); - break; - } -- read_unlock(¤t->fs->lock); -+#ifdef CONFIG_VE -+ if (nd->dentry == get_exec_env()->fs_root && -+ nd->mnt == get_exec_env()->fs_rootmnt) { -+ read_unlock(¤t->fs->lock); -+ break; -+ } -+#endif -+ read_unlock(¤t->fs->lock); - spin_lock(&dcache_lock); - if (nd->dentry != nd->mnt->mnt_root) { - nd->dentry = dget(nd->dentry->d_parent); -@@ -745,6 +771,10 @@ static int do_lookup(struct nameidata *n - if (dentry->d_op && dentry->d_op->d_revalidate) - goto need_revalidate; - done: -+ if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) { -+ dput(dentry); -+ return -ENOENT; -+ } - path->mnt = mnt; - path->dentry = dentry; - __follow_mount(path); -@@ -801,7 +831,7 @@ static fastcall int __link_path_walk(con - nd->flags |= LOOKUP_CONTINUE; - err = exec_permission_lite(inode, nd); - if (err == -EAGAIN) -- err = vfs_permission(nd, MAY_EXEC); -+ err = 
vfs_permission(nd, MAY_EXEC, NULL); - if (err) - break; - -@@ -864,6 +894,9 @@ static fastcall int __link_path_walk(con - goto out_dput; - - if (inode->i_op->follow_link) { -+ err = -ENOENT; -+ if (lookup_flags & LOOKUP_STRICT) -+ goto out_dput; - err = do_follow_link(&next, nd); - if (err) - goto return_err; -@@ -911,6 +944,7 @@ last_component: - break; - inode = next.dentry->d_inode; - if ((lookup_flags & LOOKUP_FOLLOW) -+ && !(lookup_flags & LOOKUP_STRICT) - && inode && inode->i_op && inode->i_op->follow_link) { - err = do_follow_link(&next, nd); - if (err) -@@ -951,6 +985,11 @@ return_reval: - break; - } - return_base: -+ if (!(nd->flags & LOOKUP_NOAREACHECK)) { -+ err = check_area_access_ve(nd->dentry, nd->mnt); -+ if (err) -+ break; -+ } - return 0; - out_dput: - dput_path(&next, nd); -@@ -1219,7 +1258,7 @@ static struct dentry * __lookup_hash(str - int err; - - inode = base->d_inode; -- err = permission(inode, MAY_EXEC, nd); -+ err = permission(inode, MAY_EXEC, nd, NULL); - dentry = ERR_PTR(err); - if (err) - goto out; -@@ -1354,7 +1393,7 @@ static int may_delete(struct inode *dir, - - BUG_ON(victim->d_parent->d_inode != dir); - -- error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); -+ error = permission(dir,MAY_WRITE | MAY_EXEC, NULL, NULL); - if (error) - return error; - if (IS_APPEND(dir)) -@@ -1391,7 +1430,7 @@ static inline int may_create(struct inod - return -EEXIST; - if (IS_DEADDIR(dir)) - return -ENOENT; -- return permission(dir,MAY_WRITE | MAY_EXEC, nd); -+ return permission(dir,MAY_WRITE | MAY_EXEC, nd, NULL); - } - - /* -@@ -1491,7 +1530,7 @@ int may_open(struct nameidata *nd, int a - if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) - return -EISDIR; - -- error = vfs_permission(nd, acc_mode); -+ error = vfs_permission(nd, acc_mode, NULL); - if (error) - return error; - -@@ -1851,6 +1890,7 @@ asmlinkage long sys_mknod(const char __u - { - return sys_mknodat(AT_FDCWD, filename, mode, dev); - } -+EXPORT_SYMBOL_GPL(sys_mknod); - - int 
vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) - { -@@ -1909,6 +1949,7 @@ asmlinkage long sys_mkdir(const char __u - { - return sys_mkdirat(AT_FDCWD, pathname, mode); - } -+EXPORT_SYMBOL_GPL(sys_mkdir); - - /* - * We try to drop the dentry early: we should have -@@ -2016,6 +2057,7 @@ asmlinkage long sys_rmdir(const char __u - { - return do_rmdir(AT_FDCWD, pathname); - } -+EXPORT_SYMBOL_GPL(sys_rmdir); - - int vfs_unlink(struct inode *dir, struct dentry *dentry) - { -@@ -2115,6 +2157,7 @@ asmlinkage long sys_unlink(const char __ - { - return do_unlinkat(AT_FDCWD, pathname); - } -+EXPORT_SYMBOL_GPL(sys_unlink); - - int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) - { -@@ -2313,7 +2356,7 @@ static int vfs_rename_dir(struct inode * - * we'll need to flip '..'. - */ - if (new_dir != old_dir) { -- error = permission(old_dentry->d_inode, MAY_WRITE, NULL); -+ error = permission(old_dentry->d_inode, MAY_WRITE, NULL, NULL); - if (error) - return error; - } -@@ -2380,6 +2423,9 @@ int vfs_rename(struct inode *old_dir, st - int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); - const char *old_name; - -+ if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir)) -+ return -EXDEV; -+ - if (old_dentry->d_inode == new_dentry->d_inode) - return 0; - -diff -upr linux-2.6.16.orig/fs/namespace.c linux-2.6.16-026test009/fs/namespace.c ---- linux-2.6.16.orig/fs/namespace.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/namespace.c 2006-04-19 15:02:12.000000000 +0400 -@@ -40,13 +40,15 @@ static inline int sysfs_init(void) - - /* spinlock for vfsmount related operations, inplace of dcache_lock */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); -+EXPORT_SYMBOL(vfsmount_lock); - - static int event; - - static struct list_head *mount_hashtable; - static int hash_mask __read_mostly, hash_bits __read_mostly; - static kmem_cache_t *mnt_cache; --static struct rw_semaphore namespace_sem; -+struct rw_semaphore 
namespace_sem; -+EXPORT_SYMBOL(namespace_sem); - - /* /sys/fs */ - decl_subsys(fs, NULL, NULL); -@@ -371,10 +373,32 @@ static int show_vfsmnt(struct seq_file * - { 0, NULL } - }; - struct proc_fs_info *fs_infop; -+ char *path_buf, *path; - -- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); -+ /* skip FS_NOMOUNT mounts (rootfs) */ -+ if (mnt->mnt_sb->s_flags & MS_NOUSER) -+ return 0; -+ -+ path_buf = (char *) __get_free_page(GFP_KERNEL); -+ if (!path_buf) -+ return -ENOMEM; -+ path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); -+ if (IS_ERR(path)) { -+ free_page((unsigned long) path_buf); -+ /* -+ * This means that the file position will be incremented, i.e. -+ * the total number of "invisible" vfsmnt will leak. -+ */ -+ return 0; -+ } -+ -+ if (ve_is_super(get_exec_env())) -+ mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); -+ else -+ mangle(m, mnt->mnt_sb->s_type->name); - seq_putc(m, ' '); -- seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); -+ mangle(m, path); -+ free_page((unsigned long) path_buf); - seq_putc(m, ' '); - mangle(m, mnt->mnt_sb->s_type->name); - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? 
" ro" : " rw"); -@@ -474,6 +498,7 @@ void release_mounts(struct list_head *he - mntput(mnt); - } - } -+EXPORT_SYMBOL(release_mounts); - - void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) - { -@@ -498,6 +523,7 @@ void umount_tree(struct vfsmount *mnt, i - change_mnt_propagation(p, MS_PRIVATE); - } - } -+EXPORT_SYMBOL(umount_tree); - - static int do_umount(struct vfsmount *mnt, int flags) - { -@@ -608,7 +634,7 @@ asmlinkage long sys_umount(char __user * - goto dput_and_out; - - retval = -EPERM; -- if (!capable(CAP_SYS_ADMIN)) -+ if (!capable(CAP_VE_SYS_ADMIN)) - goto dput_and_out; - - retval = do_umount(nd.mnt, flags); -@@ -632,7 +658,7 @@ asmlinkage long sys_oldumount(char __use - - static int mount_is_safe(struct nameidata *nd) - { -- if (capable(CAP_SYS_ADMIN)) -+ if (capable(CAP_VE_SYS_ADMIN)) - return 0; - return -EPERM; - #ifdef notyet -@@ -642,7 +668,7 @@ static int mount_is_safe(struct nameidat - if (current->uid != nd->dentry->d_inode->i_uid) - return -EPERM; - } -- if (vfs_permission(nd, MAY_WRITE)) -+ if (vfs_permission(nd, MAY_WRITE, NULL)) - return -EPERM; - return 0; - #endif -@@ -917,7 +943,7 @@ static int do_remount(struct nameidata * - int err; - struct super_block *sb = nd->mnt->mnt_sb; - -- if (!capable(CAP_SYS_ADMIN)) -+ if (!capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - - if (!check_mnt(nd->mnt)) -@@ -951,7 +977,7 @@ static int do_move_mount(struct nameidat - struct nameidata old_nd, parent_nd; - struct vfsmount *p; - int err = 0; -- if (!capable(CAP_SYS_ADMIN)) -+ if (!capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - if (!old_name || !*old_name) - return -EINVAL; -@@ -1031,7 +1057,7 @@ static int do_new_mount(struct nameidata - return -EINVAL; - - /* we need capabilities... 
*/ -- if (!capable(CAP_SYS_ADMIN)) -+ if (!capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - - mnt = do_kern_mount(type, flags, name, data); -@@ -1072,6 +1098,10 @@ int do_add_mount(struct vfsmount *newmnt - if ((err = graft_tree(newmnt, nd))) - goto unlock; - -+ if (newmnt->mnt_mountpoint->d_flags & DCACHE_VIRTUAL) -+ /* unaccessible yet - no lock */ -+ newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL; -+ - if (fslist) { - /* add to the specified expiration list */ - spin_lock(&vfsmount_lock); -@@ -1469,6 +1499,7 @@ out1: - free_page(type_page); - return retval; - } -+EXPORT_SYMBOL_GPL(sys_mount); - - /* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. -@@ -1520,7 +1551,7 @@ static void chroot_fs_refs(struct nameid - struct fs_struct *fs; - - read_lock(&tasklist_lock); -- do_each_thread(g, p) { -+ do_each_thread_ve(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { -@@ -1535,7 +1566,7 @@ static void chroot_fs_refs(struct nameid - put_fs_struct(fs); - } else - task_unlock(p); -- } while_each_thread(g, p); -+ } while_each_thread_ve(g, p); - read_unlock(&tasklist_lock); - } - -@@ -1688,10 +1719,10 @@ static void __init init_mount_tree(void) - - init_task.namespace = namespace; - read_lock(&tasklist_lock); -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - get_namespace(namespace); - p->namespace = namespace; -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - read_unlock(&tasklist_lock); - - set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); -@@ -1707,7 +1738,8 @@ void __init mnt_init(unsigned long mempa - init_rwsem(&namespace_sem); - - mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), -- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); -+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, -+ NULL, NULL); - - mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); - -diff -upr linux-2.6.16.orig/fs/nfs/dir.c linux-2.6.16-026test009/fs/nfs/dir.c ---- linux-2.6.16.orig/fs/nfs/dir.c 
2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/nfs/dir.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1635,7 +1635,8 @@ out: - return -EACCES; - } - --int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) -+int nfs_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - struct rpc_cred *cred; - int res = 0; -@@ -1683,7 +1684,7 @@ out: - out_notsup: - res = nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (res == 0) -- res = generic_permission(inode, mask, NULL); -+ res = generic_permission(inode, mask, NULL, perm); - unlock_kernel(); - return res; - } -diff -upr linux-2.6.16.orig/fs/nfs/nfsroot.c linux-2.6.16-026test009/fs/nfs/nfsroot.c ---- linux-2.6.16.orig/fs/nfs/nfsroot.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/nfs/nfsroot.c 2006-04-19 15:02:12.000000000 +0400 -@@ -312,7 +312,7 @@ static int __init root_nfs_name(char *na - /* Override them by options set on kernel command-line */ - root_nfs_parse(name, buf); - -- cp = system_utsname.nodename; -+ cp = ve_utsname.nodename; - if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { - printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); - return -1; -diff -upr linux-2.6.16.orig/fs/nfsd/nfs3proc.c linux-2.6.16-026test009/fs/nfsd/nfs3proc.c ---- linux-2.6.16.orig/fs/nfsd/nfs3proc.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/nfsd/nfs3proc.c 2006-04-19 15:02:11.000000000 +0400 -@@ -682,7 +682,7 @@ static struct svc_procedure nfsd_proced - PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), - PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), - PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), -- PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), -+ PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), - PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), - PROC(create, 
create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), - PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), -diff -upr linux-2.6.16.orig/fs/nfsd/nfs4proc.c linux-2.6.16-026test009/fs/nfsd/nfs4proc.c ---- linux-2.6.16.orig/fs/nfsd/nfs4proc.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/nfsd/nfs4proc.c 2006-04-19 15:02:11.000000000 +0400 -@@ -975,7 +975,7 @@ struct nfsd4_voidargs { int dummy; }; - */ - static struct svc_procedure nfsd_procedures4[2] = { - PROC(null, void, void, void, RC_NOCACHE, 1), -- PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) -+ PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) - }; - - struct svc_version nfsd_version4 = { -diff -upr linux-2.6.16.orig/fs/nfsd/nfsfh.c linux-2.6.16-026test009/fs/nfsd/nfsfh.c ---- linux-2.6.16.orig/fs/nfsd/nfsfh.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/nfsd/nfsfh.c 2006-04-19 15:02:11.000000000 +0400 -@@ -56,7 +56,7 @@ static int nfsd_acceptable(void *expv, s - /* make sure parents give x permission to user */ - int err; - parent = dget_parent(tdentry); -- err = permission(parent->d_inode, MAY_EXEC, NULL); -+ err = permission(parent->d_inode, MAY_EXEC, NULL, NULL); - if (err < 0) { - dput(parent); - break; -diff -upr linux-2.6.16.orig/fs/nfsd/nfsproc.c linux-2.6.16-026test009/fs/nfsd/nfsproc.c ---- linux-2.6.16.orig/fs/nfsd/nfsproc.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/nfsd/nfsproc.c 2006-04-19 15:02:11.000000000 +0400 -@@ -553,7 +553,7 @@ static struct svc_procedure nfsd_proced - PROC(none, void, void, none, RC_NOCACHE, ST), - PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), - PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), -- PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), -+ PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), - PROC(none, void, void, 
none, RC_NOCACHE, ST), - PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), - PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), -diff -upr linux-2.6.16.orig/fs/nfsd/vfs.c linux-2.6.16-026test009/fs/nfsd/vfs.c ---- linux-2.6.16.orig/fs/nfsd/vfs.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/nfsd/vfs.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1817,12 +1817,13 @@ nfsd_permission(struct svc_export *exp, - inode->i_uid == current->fsuid) - return 0; - -- err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); -+ err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), -+ NULL, NULL); - - /* Allow read access to binaries even when mode 111 */ - if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (MAY_READ | MAY_OWNER_OVERRIDE)) -- err = permission(inode, MAY_EXEC, NULL); -+ err = permission(inode, MAY_EXEC, NULL, NULL); - - return err? nfserrno(err) : 0; - } -diff -upr linux-2.6.16.orig/fs/ntfs/super.c linux-2.6.16-026test009/fs/ntfs/super.c ---- linux-2.6.16.orig/fs/ntfs/super.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/ntfs/super.c 2006-04-19 15:02:11.000000000 +0400 -@@ -3033,7 +3033,7 @@ iput_tmp_ino_err_out_now: - * method again... FIXME: Do we need to do this twice now because of - * attribute inodes? I think not, so leave as is for now... (AIA) - */ -- if (invalidate_inodes(sb)) { -+ if (invalidate_inodes(sb, 0)) { - ntfs_error(sb, "Busy inodes left. This is most likely a NTFS " - "driver bug."); - /* Copied from fs/super.c. I just love this message. 
(-; */ -diff -upr linux-2.6.16.orig/fs/open.c linux-2.6.16-026test009/fs/open.c ---- linux-2.6.16.orig/fs/open.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/open.c 2006-04-19 15:02:12.000000000 +0400 -@@ -25,6 +25,7 @@ - #include <linux/fs.h> - #include <linux/personality.h> - #include <linux/pagemap.h> -+#include <linux/faudit.h> - #include <linux/syscalls.h> - #include <linux/rcupdate.h> - -@@ -121,6 +122,34 @@ static int vfs_statfs64(struct super_blo - return 0; - } - -+static int faudit_statfs(struct vfsmount *mnt, struct dentry *dentry, -+ struct statfs *buf) -+{ -+ struct faudit_stat_arg arg; -+ -+ arg.mnt = mnt; -+ arg.dentry = dentry; -+ arg.stat = buf; -+ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg) -+ != NOTIFY_DONE) -+ return arg.err; -+ return 0; -+} -+ -+static int faudit_statfs64(struct vfsmount *mnt, struct dentry *dentry, -+ struct statfs64 *buf) -+{ -+ struct faudit_stat_arg arg; -+ -+ arg.mnt = mnt; -+ arg.dentry = dentry; -+ arg.stat = buf; -+ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS64, -+ &arg) != NOTIFY_DONE) -+ return arg.err; -+ return 0; -+} -+ - asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf) - { - struct nameidata nd; -@@ -130,6 +159,8 @@ asmlinkage long sys_statfs(const char __ - if (!error) { - struct statfs tmp; - error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp); -+ if (!error) -+ error = faudit_statfs(nd.mnt, nd.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_release(&nd); -@@ -149,6 +180,8 @@ asmlinkage long sys_statfs64(const char - if (!error) { - struct statfs64 tmp; - error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp); -+ if (!error) -+ error = faudit_statfs64(nd.mnt, nd.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_release(&nd); -@@ -168,6 +201,8 @@ asmlinkage long sys_fstatfs(unsigned int - if (!file) - goto out; - 
error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp); -+ if (!error) -+ error = faudit_statfs(file->f_vfsmnt, file->f_dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -@@ -189,6 +224,8 @@ asmlinkage long sys_fstatfs64(unsigned i - if (!file) - goto out; - error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp); -+ if (!error) -+ error = faudit_statfs64(file->f_vfsmnt, file->f_dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -@@ -243,7 +280,7 @@ static long do_sys_truncate(const char _ - if (!S_ISREG(inode->i_mode)) - goto dput_and_out; - -- error = vfs_permission(&nd, MAY_WRITE); -+ error = vfs_permission(&nd, MAY_WRITE, NULL); - if (error) - goto dput_and_out; - -@@ -397,7 +434,7 @@ asmlinkage long sys_utime(char __user * - goto dput_and_out; - - if (current->fsuid != inode->i_uid && -- (error = vfs_permission(&nd, MAY_WRITE)) != 0) -+ (error = vfs_permission(&nd, MAY_WRITE, NULL)) != 0) - goto dput_and_out; - } - mutex_lock(&inode->i_mutex); -@@ -450,7 +487,7 @@ long do_utimes(int dfd, char __user *fil - goto dput_and_out; - - if (current->fsuid != inode->i_uid && -- (error = vfs_permission(&nd, MAY_WRITE)) != 0) -+ (error = vfs_permission(&nd, MAY_WRITE, NULL)) != 0) - goto dput_and_out; - } - mutex_lock(&inode->i_mutex); -@@ -514,7 +551,7 @@ asmlinkage long sys_faccessat(int dfd, c - - res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); - if (!res) { -- res = vfs_permission(&nd, mode); -+ res = vfs_permission(&nd, mode, NULL); - /* SuS v2 requires we report a read only fs too */ - if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) - && !special_file(nd.dentry->d_inode->i_mode)) -@@ -543,7 +580,7 @@ asmlinkage long sys_chdir(const char __u - if (error) - goto out; - -- error = vfs_permission(&nd, MAY_EXEC); -+ error = vfs_permission(&nd, MAY_EXEC, NULL); - if (error) - goto dput_and_out; - -@@ -594,7 +631,7 @@ 
asmlinkage long sys_chroot(const char __ - if (error) - goto out; - -- error = vfs_permission(&nd, MAY_EXEC); -+ error = vfs_permission(&nd, MAY_EXEC, NULL); - if (error) - goto dput_and_out; - -@@ -733,6 +770,7 @@ asmlinkage long sys_chown(const char __u - } - return error; - } -+EXPORT_SYMBOL_GPL(sys_chown); - - asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, - gid_t group, int flag) -diff -upr linux-2.6.16.orig/fs/partitions/check.c linux-2.6.16-026test009/fs/partitions/check.c ---- linux-2.6.16.orig/fs/partitions/check.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/partitions/check.c 2006-04-19 15:02:12.000000000 +0400 -@@ -128,6 +128,7 @@ char *disk_name(struct gendisk *hd, int - - return buf; - } -+EXPORT_SYMBOL(disk_name); - - const char *bdevname(struct block_device *bdev, char *buf) - { -@@ -345,6 +346,7 @@ static char *make_block_name(struct gend - char *name; - static char *block_str = "block:"; - int size; -+ char *s; - - size = strlen(block_str) + strlen(disk->disk_name) + 1; - name = kmalloc(size, GFP_KERNEL); -@@ -352,6 +354,10 @@ static char *make_block_name(struct gend - return NULL; - strcpy(name, block_str); - strcat(name, disk->disk_name); -+ /* ewww... some of these buggers have / in name... 
*/ -+ s = strchr(name, '/'); -+ if (s) -+ *s = '!'; - return name; - } - -diff -upr linux-2.6.16.orig/fs/pipe.c linux-2.6.16-026test009/fs/pipe.c ---- linux-2.6.16.orig/fs/pipe.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/pipe.c 2006-04-19 15:02:12.000000000 +0400 -@@ -797,6 +797,7 @@ close_f1: - no_files: - return error; - } -+EXPORT_SYMBOL_GPL(do_pipe); - - /* - * pipefs should _never_ be mounted by userland - too much of security hassle, -diff -upr linux-2.6.16.orig/fs/proc/array.c linux-2.6.16-026test009/fs/proc/array.c ---- linux-2.6.16.orig/fs/proc/array.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/array.c 2006-04-19 15:02:12.000000000 +0400 -@@ -76,6 +76,8 @@ - #include <linux/cpuset.h> - #include <linux/rcupdate.h> - -+#include <ub/beancounter.h> -+ - #include <asm/uaccess.h> - #include <asm/pgtable.h> - #include <asm/io.h> -@@ -161,8 +163,13 @@ static inline char * task_state(struct t - struct group_info *group_info; - int g; - struct fdtable *fdt = NULL; -+ pid_t pid, ppid, tgid; -+ -+ pid = get_task_pid(p); -+ tgid = get_task_tgid(p); - - read_lock(&tasklist_lock); -+ ppid = get_task_ppid(p); - buffer += sprintf(buffer, - "State:\t%s\n" - "SleepAVG:\t%lu%%\n" -@@ -174,9 +181,9 @@ static inline char * task_state(struct t - "Gid:\t%d\t%d\t%d\t%d\n", - get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), -- p->tgid, -- p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, -- pid_alive(p) && p->ptrace ? p->parent->pid : 0, -+ tgid, -+ pid, ppid, -+ pid_alive(p) && p->ptrace ? 
get_task_pid(p->parent) : 0, - p->uid, p->euid, p->suid, p->fsuid, - p->gid, p->egid, p->sgid, p->fsgid); - read_unlock(&tasklist_lock); -@@ -199,6 +206,18 @@ static inline char * task_state(struct t - put_group_info(group_info); - - buffer += sprintf(buffer, "\n"); -+ -+#ifdef CONFIG_VE -+ buffer += sprintf(buffer, -+ "envID:\t%d\n" -+ "VPid:\t%d\n" -+ "PNState:\t%u\n" -+ "StopState:\t%u\n", -+ VE_TASK_INFO(p)->owner_env->veid, -+ virt_pid(p), -+ p->pn_state, -+ p->stopped_state); -+#endif - return buffer; - } - -@@ -244,7 +263,7 @@ static void collect_sigign_sigcatch(stru - - static inline char * task_sig(struct task_struct *p, char *buffer) - { -- sigset_t pending, shpending, blocked, ignored, caught; -+ sigset_t pending, shpending, blocked, ignored, caught, saved; - int num_threads = 0; - unsigned long qsize = 0; - unsigned long qlim = 0; -@@ -254,6 +273,7 @@ static inline char * task_sig(struct tas - sigemptyset(&blocked); - sigemptyset(&ignored); - sigemptyset(&caught); -+ sigemptyset(&saved); - - /* Gather all the data with the appropriate locks held */ - read_lock(&tasklist_lock); -@@ -262,6 +282,7 @@ static inline char * task_sig(struct tas - pending = p->pending.signal; - shpending = p->signal->shared_pending.signal; - blocked = p->blocked; -+ saved = p->saved_sigmask; - collect_sigign_sigcatch(p, &ignored, &caught); - num_threads = atomic_read(&p->signal->count); - qsize = atomic_read(&p->user->sigpending); -@@ -279,6 +300,7 @@ static inline char * task_sig(struct tas - buffer = render_sigset_t("SigBlk:\t", &blocked, buffer); - buffer = render_sigset_t("SigIgn:\t", &ignored, buffer); - buffer = render_sigset_t("SigCgt:\t", &caught, buffer); -+ buffer = render_sigset_t("SigSvd:\t", &saved, buffer); - - return buffer; - } -@@ -293,10 +315,27 @@ static inline char *task_cap(struct task - cap_t(p->cap_effective)); - } - -+#ifdef CONFIG_USER_RESOURCE -+static inline void ub_dump_task_info(struct task_struct *tsk, -+ char *stsk, int ltsk, char *smm, int lmm) 
-+{ -+ print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk); -+ task_lock(tsk); -+ if (tsk->mm) -+ print_ub_uid(tsk->mm->mm_ub, smm, lmm); -+ else -+ strncpy(smm, "N/A", lmm); -+ task_unlock(tsk); -+} -+#endif -+ - int proc_pid_status(struct task_struct *task, char * buffer) - { - char * orig = buffer; - struct mm_struct *mm = get_task_mm(task); -+#ifdef CONFIG_USER_RESOURCE -+ char tsk_ub_info[64], mm_ub_info[64]; -+#endif - - buffer = task_name(task, buffer); - buffer = task_state(task, buffer); -@@ -311,6 +350,14 @@ int proc_pid_status(struct task_struct * - #if defined(CONFIG_S390) - buffer = task_show_regs(task, buffer); - #endif -+#ifdef CONFIG_USER_RESOURCE -+ ub_dump_task_info(task, -+ tsk_ub_info, sizeof(tsk_ub_info), -+ mm_ub_info, sizeof(mm_ub_info)); -+ -+ buffer += sprintf(buffer, "TaskUB:\t%s\n", tsk_ub_info); -+ buffer += sprintf(buffer, "MMUB:\t%s\n", mm_ub_info); -+#endif - return buffer - orig; - } - -@@ -333,6 +380,10 @@ static int do_task_stat(struct task_stru - DEFINE_KTIME(it_real_value); - struct task_struct *t; - char tcomm[sizeof(task->comm)]; -+#ifdef CONFIG_USER_RESOURCE -+ char ub_task_info[64]; -+ char ub_mm_info[64]; -+#endif - - state = *get_task_state(task); - vsize = eip = esp = 0; -@@ -370,11 +421,12 @@ static int do_task_stat(struct task_stru - } - if (task->signal) { - if (task->signal->tty) { -- tty_pgrp = task->signal->tty->pgrp; -+ tty_pgrp = pid_type_to_vpid(PIDTYPE_PGID, -+ task->signal->tty->pgrp); - tty_nr = new_encode_dev(tty_devnum(task->signal->tty)); - } -- pgid = process_group(task); -- sid = task->signal->session; -+ pgid = get_task_pgid(task); -+ sid = get_task_sid(task); - cmin_flt = task->signal->cmin_flt; - cmaj_flt = task->signal->cmaj_flt; - cutime = task->signal->cutime; -@@ -388,7 +440,7 @@ static int do_task_stat(struct task_stru - } - it_real_value = task->signal->real_timer.expires; - } -- ppid = pid_alive(task) ? 
task->group_leader->real_parent->tgid : 0; -+ ppid = get_task_ppid(task); - read_unlock(&tasklist_lock); - - if (!whole || num_threads<2) -@@ -407,14 +459,34 @@ static int do_task_stat(struct task_stru - - /* Temporary variable needed for gcc-2.96 */ - /* convert timespec -> nsec*/ -+#ifndef CONFIG_VE - start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC - + task->start_time.tv_nsec; -+#else -+ start_time = (unsigned long long)(task->start_time.tv_sec - -+ get_exec_env()->init_entry->start_time.tv_sec) * -+ NSEC_PER_SEC + task->start_time.tv_nsec - -+ get_exec_env()->init_entry->start_time.tv_nsec; -+#endif - /* convert nsec -> ticks */ - start_time = nsec_to_clock_t(start_time); - -+#ifdef CONFIG_USER_RESOURCE -+ ub_dump_task_info(task, -+ ub_task_info, sizeof(ub_task_info), -+ ub_mm_info, sizeof(ub_mm_info)); -+#endif -+ - res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ - %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \ --%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", -+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu" -+#ifdef CONFIG_VE -+"0 0 0 0 0 0 0 0 %d %u" -+#endif -+#ifdef CONFIG_USER_RESOURCE -+ " %s %s" -+#endif -+ "\n", - task->pid, - tcomm, - state, -@@ -459,7 +531,16 @@ static int do_task_stat(struct task_stru - task->exit_signal, - task_cpu(task), - task->rt_priority, -- task->policy); -+ task->policy -+#ifdef CONFIG_VE -+ , virt_pid(task), -+ VEID(VE_TASK_INFO(task)->owner_env) -+#endif -+#ifdef CONFIG_USER_RESOURCE -+ , ub_task_info, -+ ub_mm_info -+#endif -+ ); - if(mm) - mmput(mm); - return res; -diff -upr linux-2.6.16.orig/fs/proc/base.c linux-2.6.16-026test009/fs/proc/base.c ---- linux-2.6.16.orig/fs/proc/base.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/base.c 2006-04-19 15:02:12.000000000 +0400 -@@ -291,22 +291,25 @@ static int proc_fd_link(struct inode *in - struct files_struct *files; - struct file *file; - int fd = proc_type(inode) - PROC_TID_FD_DIR; 
-+ int err = -ENOENT; - - files = get_files_struct(task); - if (files) { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { -- *mnt = mntget(file->f_vfsmnt); -- *dentry = dget(file->f_dentry); -- rcu_read_unlock(); -- put_files_struct(files); -- return 0; -+ if (d_root_check(file->f_dentry, file->f_vfsmnt)) { -+ err = -EACCES; -+ } else { -+ *mnt = mntget(file->f_vfsmnt); -+ *dentry = dget(file->f_dentry); -+ err = 0; -+ } - } - rcu_read_unlock(); - put_files_struct(files); - } -- return -ENOENT; -+ return err; - } - - static struct fs_struct *get_fs_struct(struct task_struct *task) -@@ -326,10 +329,12 @@ static int proc_cwd_link(struct inode *i - int result = -ENOENT; - if (fs) { - read_lock(&fs->lock); -- *mnt = mntget(fs->pwdmnt); -- *dentry = dget(fs->pwd); -+ result = d_root_check(fs->pwd, fs->pwdmnt); -+ if (!result) { -+ *mnt = mntget(fs->pwdmnt); -+ *dentry = dget(fs->pwd); -+ } - read_unlock(&fs->lock); -- result = 0; - put_fs_struct(fs); - } - return result; -@@ -579,19 +584,21 @@ static int proc_check_root(struct inode - return proc_check_chroot(root, vfsmnt); - } - --static int proc_permission(struct inode *inode, int mask, struct nameidata *nd) -+static int proc_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { -- if (generic_permission(inode, mask, NULL) != 0) -+ if (generic_permission(inode, mask, NULL, perm) != 0) - return -EACCES; - return proc_check_root(inode); - } - --static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd) -+static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - struct dentry *root; - struct vfsmount *vfsmnt; - -- if (generic_permission(inode, mask, NULL) != 0) -+ if (generic_permission(inode, mask, NULL, perm) != 0) - return -EACCES; - - if (proc_task_root_link(inode, &root, &vfsmnt)) -@@ -1303,6 +1310,10 @@ static struct inode *proc_pid_make_inode - struct inode * inode; - 
struct proc_inode *ei; - -+ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, -+ VE_OWNER_FSTYPE(sb->s_type))) -+ return NULL; -+ - /* We need a new inode */ - - inode = new_inode(sb); -@@ -1406,6 +1417,10 @@ static void pid_base_iput(struct dentry - spin_lock(&task->proc_lock); - if (task->proc_dentry == dentry) - task->proc_dentry = NULL; -+#ifdef CONFIG_VE -+ if (VE_TASK_INFO(task)->glob_proc_dentry == dentry) -+ VE_TASK_INFO(task)->glob_proc_dentry = NULL; -+#endif - spin_unlock(&task->proc_lock); - iput(inode); - } -@@ -1879,14 +1894,14 @@ static int proc_self_readlink(struct den - int buflen) - { - char tmp[30]; -- sprintf(tmp, "%d", current->tgid); -+ sprintf(tmp, "%d", get_task_tgid(current)); - return vfs_readlink(dentry,buffer,buflen,tmp); - } - - static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) - { - char tmp[30]; -- sprintf(tmp, "%d", current->tgid); -+ sprintf(tmp, "%d", get_task_tgid(current)); - return ERR_PTR(vfs_follow_link(nd,tmp)); - } - -@@ -1911,11 +1926,8 @@ static struct inode_operations proc_self - * of PIDTYPE_PID. - */ - --struct dentry *proc_pid_unhash(struct task_struct *p) -+struct dentry *__proc_pid_unhash(struct task_struct *p, struct dentry *proc_dentry) - { -- struct dentry *proc_dentry; -- -- proc_dentry = p->proc_dentry; - if (proc_dentry != NULL) { - - spin_lock(&dcache_lock); -@@ -1933,6 +1945,14 @@ struct dentry *proc_pid_unhash(struct ta - return proc_dentry; - } - -+void proc_pid_unhash(struct task_struct *p, struct dentry *pd[2]) -+{ -+ pd[0] = __proc_pid_unhash(p, p->proc_dentry); -+#ifdef CONFIG_VE -+ pd[1] = __proc_pid_unhash(p, VE_TASK_INFO(p)->glob_proc_dentry); -+#endif -+} -+ - /** - * proc_pid_flush - recover memory used by stale /proc/@pid/x entries - * @proc_dentry: directoy to prune. -@@ -1940,7 +1960,7 @@ struct dentry *proc_pid_unhash(struct ta - * Shrink the /proc directory that was used by the just killed thread. 
- */ - --void proc_pid_flush(struct dentry *proc_dentry) -+void __proc_pid_flush(struct dentry *proc_dentry) - { - might_sleep(); - if(proc_dentry != NULL) { -@@ -1949,12 +1969,21 @@ void proc_pid_flush(struct dentry *proc_ - } - } - -+void proc_pid_flush(struct dentry *proc_dentry[2]) -+{ -+ __proc_pid_flush(proc_dentry[0]); -+#ifdef CONFIG_VE -+ __proc_pid_flush(proc_dentry[1]); -+#endif -+} -+ - /* SMP-safe */ - struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) - { - struct task_struct *task; - struct inode *inode; - struct proc_inode *ei; -+ struct dentry *pd[2]; - unsigned tgid; - int died; - -@@ -1978,7 +2007,19 @@ struct dentry *proc_pid_lookup(struct in - goto out; - - read_lock(&tasklist_lock); -- task = find_task_by_pid(tgid); -+ task = find_task_by_pid_ve(tgid); -+ /* In theory we are allowed to lookup both /proc/VIRT_PID and -+ * /proc/GLOBAL_PID inside VE. However, current /proc implementation -+ * cannot maintain two references to one task, so that we have -+ * to prohibit /proc/GLOBAL_PID. -+ */ -+ if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tgid)) { -+ /* However, VE_ENTERed tasks are exception, they use global -+ * pids. 
-+ */ -+ if (virt_pid(task) != tgid) -+ task = NULL; -+ } - if (task) - get_task_struct(task); - read_unlock(&tasklist_lock); -@@ -2007,16 +2048,23 @@ struct dentry *proc_pid_lookup(struct in - died = 0; - d_add(dentry, inode); - spin_lock(&task->proc_lock); -+#ifdef CONFIG_VE -+ if (ve_is_super(VE_OWNER_FSTYPE(inode->i_sb->s_type))) -+ VE_TASK_INFO(task)->glob_proc_dentry = dentry; -+ else -+ task->proc_dentry = dentry; -+#else - task->proc_dentry = dentry; -+#endif - if (!pid_alive(task)) { -- dentry = proc_pid_unhash(task); -+ proc_pid_unhash(task, pd); - died = 1; - } - spin_unlock(&task->proc_lock); - - put_task_struct(task); - if (died) { -- proc_pid_flush(dentry); -+ proc_pid_flush(pd); - goto out; - } - return NULL; -@@ -2037,7 +2085,12 @@ static struct dentry *proc_task_lookup(s - goto out; - - read_lock(&tasklist_lock); -- task = find_task_by_pid(tid); -+ task = find_task_by_pid_ve(tid); -+ /* See comment above in similar place. */ -+ if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tid)) { -+ if (virt_pid(task) != tid) -+ task = NULL; -+ } - if (task) - get_task_struct(task); - read_unlock(&tasklist_lock); -@@ -2081,7 +2134,8 @@ out: - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. 
- */ --static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) -+static int get_tgid_list(int index, unsigned long version, unsigned int *tgids, -+ struct ve_struct *ve) - { - struct task_struct *p; - int nr_tgids = 0; -@@ -2090,7 +2144,11 @@ static int get_tgid_list(int index, unsi - read_lock(&tasklist_lock); - p = NULL; - if (version) { -- p = find_task_by_pid(version); -+ struct ve_struct *oldve; -+ -+ oldve = set_exec_env(ve); -+ p = find_task_by_pid_ve(version); -+ (void)set_exec_env(oldve); - if (p && !thread_group_leader(p)) - p = NULL; - } -@@ -2098,10 +2156,10 @@ static int get_tgid_list(int index, unsi - if (p) - index = 0; - else -- p = next_task(&init_task); -+ p = __first_task_ve(ve); - -- for ( ; p != &init_task; p = next_task(p)) { -- int tgid = p->pid; -+ for ( ; p != NULL; p = __next_task_ve(ve, p)) { -+ int tgid = get_task_pid_ve(p, ve); - if (!pid_alive(p)) - continue; - if (--index >= 0) -@@ -2134,7 +2192,7 @@ static int get_tid_list(int index, unsig - * via next_thread(). - */ - if (pid_alive(task)) do { -- int tid = task->pid; -+ int tid = get_task_pid(task); - - if (--index >= 0) - continue; -@@ -2171,7 +2229,8 @@ int proc_pid_readdir(struct file * filp, - next_tgid = filp->f_version; - filp->f_version = 0; - for (;;) { -- nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); -+ nr_tgids = get_tgid_list(nr, next_tgid, tgid_array, -+ filp->f_dentry->d_sb->s_type->owner_env); - if (!nr_tgids) { - /* no more entries ! 
*/ - break; -diff -upr linux-2.6.16.orig/fs/proc/generic.c linux-2.6.16-026test009/fs/proc/generic.c ---- linux-2.6.16.orig/fs/proc/generic.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/generic.c 2006-04-19 15:02:12.000000000 +0400 -@@ -10,7 +10,9 @@ - - #include <linux/errno.h> - #include <linux/time.h> -+#include <linux/fs.h> - #include <linux/proc_fs.h> -+#include <linux/ve_owner.h> - #include <linux/stat.h> - #include <linux/module.h> - #include <linux/mount.h> -@@ -29,6 +31,8 @@ static ssize_t proc_file_write(struct fi - size_t count, loff_t *ppos); - static loff_t proc_file_lseek(struct file *, loff_t, int); - -+static DEFINE_RWLOCK(proc_tree_lock); -+ - int proc_match(int len, const char *name, struct proc_dir_entry *de) - { - if (de->namelen != len) -@@ -229,6 +233,7 @@ proc_file_lseek(struct file *file, loff_ - return retval; - } - -+#ifndef CONFIG_VE - static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) - { - struct inode *inode = dentry->d_inode; -@@ -261,9 +266,12 @@ static int proc_getattr(struct vfsmount - generic_fillattr(inode, stat); - return 0; - } -+#endif - - static struct inode_operations proc_file_inode_operations = { -+#ifndef CONFIG_VE - .setattr = proc_notify_change, -+#endif - }; - - /* -@@ -271,14 +279,20 @@ static struct inode_operations proc_file - * returns the struct proc_dir_entry for "/proc/tty/driver", and - * returns "serial" in residual. 
- */ --static int xlate_proc_name(const char *name, -+static int __xlate_proc_name(struct proc_dir_entry *root, const char *name, - struct proc_dir_entry **ret, const char **residual) - { - const char *cp = name, *next; - struct proc_dir_entry *de; - int len; - -- de = &proc_root; -+ if (*ret) { -+ de_get(*ret); -+ return 0; -+ } -+ -+ read_lock(&proc_tree_lock); -+ de = root; - while (1) { - next = strchr(cp, '/'); - if (!next) -@@ -289,15 +303,35 @@ static int xlate_proc_name(const char *n - if (proc_match(len, cp, de)) - break; - } -- if (!de) -+ if (!de) { -+ read_unlock(&proc_tree_lock); - return -ENOENT; -+ } - cp += len + 1; - } - *residual = cp; -- *ret = de; -+ *ret = de_get(de); -+ read_unlock(&proc_tree_lock); - return 0; - } - -+#ifndef CONFIG_VE -+#define xlate_proc_loc_name xlate_proc_name -+#else -+static int xlate_proc_loc_name(const char *name, -+ struct proc_dir_entry **ret, const char **residual) -+{ -+ return __xlate_proc_name(get_exec_env()->proc_root, -+ name, ret, residual); -+} -+#endif -+ -+static int xlate_proc_name(const char *name, -+ struct proc_dir_entry **ret, const char **residual) -+{ -+ return __xlate_proc_name(&proc_root, name, ret, residual); -+} -+ - static DEFINE_IDR(proc_inum_idr); - static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ - -@@ -369,6 +403,20 @@ static struct dentry_operations proc_den - .d_delete = proc_delete_dentry, - }; - -+static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir, -+ struct dentry *d) -+{ -+ struct proc_dir_entry *de; -+ -+ for (de = dir->subdir; de; de = de->next) { -+ if (de->namelen != d->d_name.len) -+ continue; -+ if (!memcmp(d->d_name.name, de->name, de->namelen)) -+ break; -+ } -+ return de_get(de); -+} -+ - /* - * Don't create negative dentries here, return -ENOENT by hand - * instead. 
-@@ -376,34 +424,147 @@ static struct dentry_operations proc_den - struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) - { - struct inode *inode = NULL; -- struct proc_dir_entry * de; -+ struct proc_dir_entry *lde, *gde; - int error = -ENOENT; - - lock_kernel(); -- de = PDE(dir); -- if (de) { -- for (de = de->subdir; de ; de = de->next) { -- if (de->namelen != dentry->d_name.len) -- continue; -- if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { -- unsigned int ino = de->low_ino; -+ lde = LPDE(dir); - -- error = -EINVAL; -- inode = proc_get_inode(dir->i_sb, ino, de); -- break; -- } -- } -- } -+ if (!lde) -+ goto out; -+ -+ read_lock(&proc_tree_lock); -+ lde = __proc_lookup(lde, dentry); -+#ifdef CONFIG_VE -+ gde = GPDE(dir); -+ if (gde) -+ gde = __proc_lookup(gde, dentry); -+#else -+ gde = NULL; -+#endif -+ read_unlock(&proc_tree_lock); -+ -+ /* -+ * There are following possible cases after lookup: -+ * -+ * lde gde -+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ * NULL NULL ENOENT -+ * loc NULL found in local tree -+ * loc glob found in both trees -+ * NULL glob found in global tree -+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ * -+ * We initialized inode as follows after lookup: -+ * -+ * inode->lde inode->gde -+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ * loc NULL in local tree -+ * loc glob both trees -+ * glob glob global tree -+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ * i.e. inode->lde is always initialized -+ */ -+ -+ if (lde == NULL && gde == NULL) -+ goto out; -+ -+ if (lde != NULL) -+ inode = proc_get_inode(dir->i_sb, lde->low_ino, lde); -+ else -+ inode = proc_get_inode(dir->i_sb, gde->low_ino, gde); -+ -+ /* -+ * We can sleep in proc_get_inode(), but since we have i_sem -+ * being taken, no one can setup GPDE/LPDE on this inode. 
-+ */ -+ if (!inode) -+ goto out_put; -+ -+#ifdef CONFIG_VE -+ GPDE(inode) = de_get(gde); -+ if (gde) -+ __module_get(gde->owner); -+ -+ /* if dentry is found in both trees and it is a directory -+ * then inode's nlink count must be altered, because local -+ * and global subtrees may differ. -+ * on the other hand, they may intersect, so actual nlink -+ * value is difficult to calculate - upper estimate is used -+ * instead of it. -+ * dentry found in global tree only must not be writable -+ * in non-super ve. -+ */ -+ if (lde && gde && lde != gde && gde->nlink > 1) -+ inode->i_nlink += gde->nlink - 2; -+ if (lde == NULL && !ve_is_super( -+ VE_OWNER_FSTYPE(dir->i_sb->s_type))) -+ inode->i_mode &= ~S_IWUGO; -+#endif - unlock_kernel(); -+ dentry->d_op = &proc_dentry_operations; -+ d_add(dentry, inode); -+ de_put(lde); -+ de_put(gde); -+ return NULL; - -- if (inode) { -- dentry->d_op = &proc_dentry_operations; -- d_add(dentry, inode); -- return NULL; -- } -+out_put: -+ de_put(lde); -+ de_put(gde); -+out: -+ unlock_kernel(); - return ERR_PTR(error); - } - -+struct proc_dir_reader { -+ struct list_head list; -+ struct proc_dir_entry *next; -+}; -+ -+static LIST_HEAD(proc_dir_readers); -+static DEFINE_SPINLOCK(proc_dir_readers_lock); -+ -+static inline void add_reader(struct proc_dir_reader *r, -+ struct proc_dir_entry *cur) -+{ -+ r->next = cur->next; -+ spin_lock(&proc_dir_readers_lock); -+ list_add(&r->list, &proc_dir_readers); -+ spin_unlock(&proc_dir_readers_lock); -+} -+ -+static inline struct proc_dir_entry *del_reader(struct proc_dir_reader *r) -+{ -+ spin_lock(&proc_dir_readers_lock); -+ list_del(&r->list); -+ spin_unlock(&proc_dir_readers_lock); -+ return r->next; -+} -+ -+static void notify_readers(struct proc_dir_entry *de) -+{ -+ struct proc_dir_reader *r; -+ -+ /* lockless since proc_tree_lock is taken for writing */ -+ list_for_each_entry(r, &proc_dir_readers, list) -+ if (r->next == de) -+ r->next = de->next; -+} -+ -+static inline int in_tree(struct 
proc_dir_entry *de, struct proc_dir_entry *dir) -+{ -+ struct proc_dir_entry *gde; -+ -+ for (gde = dir->subdir; gde; gde = gde->next) { -+ if (de->namelen != gde->namelen) -+ continue; -+ if (memcmp(de->name, gde->name, gde->namelen)) -+ continue; -+ return 1; -+ } -+ return 0; -+} -+ - /* - * This returns non-zero if at EOF, so that the /proc - * root directory can use this and check if it should -@@ -421,6 +582,7 @@ int proc_readdir(struct file * filp, - int i; - struct inode *inode = filp->f_dentry->d_inode; - int ret = 0; -+ struct proc_dir_reader this; - - lock_kernel(); - -@@ -447,13 +609,12 @@ int proc_readdir(struct file * filp, - filp->f_pos++; - /* fall through */ - default: -+ read_lock(&proc_tree_lock); - de = de->subdir; - i -= 2; - for (;;) { -- if (!de) { -- ret = 1; -- goto out; -- } -+ if (!de) -+ goto chk_global; - if (!i) - break; - de = de->next; -@@ -461,12 +622,60 @@ int proc_readdir(struct file * filp, - } - - do { -- if (filldir(dirent, de->name, de->namelen, filp->f_pos, -- de->low_ino, de->mode >> 12) < 0) -+ de_get(de); -+ add_reader(&this, de); -+ read_unlock(&proc_tree_lock); -+ ret = filldir(dirent, de->name, de->namelen, -+ filp->f_pos, de->low_ino, -+ de->mode >> 12); -+ read_lock(&proc_tree_lock); -+ de_put(de); -+ de = del_reader(&this); -+ if (ret < 0) { -+ read_unlock(&proc_tree_lock); -+ ret = 0; - goto out; -+ } - filp->f_pos++; -- de = de->next; - } while (de); -+chk_global: -+#ifdef CONFIG_VE -+ de = GPDE(inode); -+ if (de == NULL) -+ goto done; -+ -+ de = de->subdir; -+ while (de) { -+ if (in_tree(de, LPDE(inode))) { -+ de = de->next; -+ continue; -+ } -+ -+ if (i > 0) { -+ i--; -+ de = de->next; -+ continue; -+ } -+ -+ de_get(de); -+ add_reader(&this, de); -+ read_unlock(&proc_tree_lock); -+ ret = filldir(dirent, de->name, de->namelen, -+ filp->f_pos, de->low_ino, -+ de->mode >> 12); -+ read_lock(&proc_tree_lock); -+ de_put(de); -+ de = del_reader(&this); -+ if (ret < 0) { -+ read_unlock(&proc_tree_lock); -+ ret = 0; -+ 
goto out; -+ } -+ filp->f_pos++; -+ } -+done: -+#endif -+ read_unlock(&proc_tree_lock); - } - ret = 1; - out: unlock_kernel(); -@@ -488,8 +697,10 @@ static struct file_operations proc_dir_o - */ - static struct inode_operations proc_dir_inode_operations = { - .lookup = proc_lookup, -+#ifndef CONFIG_VE - .getattr = proc_getattr, - .setattr = proc_notify_change, -+#endif - }; - - static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) -@@ -499,10 +710,20 @@ static int proc_register(struct proc_dir - i = get_inode_number(); - if (i == 0) - return -EAGAIN; -+ -+ write_lock(&proc_tree_lock); -+ if (dir->deleted) { -+ write_unlock(&proc_tree_lock); -+ release_inode_number(i); -+ return -ENOENT; -+ } -+ - dp->low_ino = i; - dp->next = dir->subdir; -- dp->parent = dir; -+ dp->parent = de_get(dir); - dir->subdir = dp; -+ write_unlock(&proc_tree_lock); -+ - if (S_ISDIR(dp->mode)) { - if (dp->proc_iops == NULL) { - dp->proc_fops = &proc_dir_operations; -@@ -556,24 +777,26 @@ static struct proc_dir_entry *proc_creat - mode_t mode, - nlink_t nlink) - { -- struct proc_dir_entry *ent = NULL; -+ struct proc_dir_entry *ent; - const char *fn = name; - int len; - - /* make sure name is valid */ -- if (!name || !strlen(name)) goto out; -+ if (!name || !strlen(name)) -+ goto out; - -- if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0) -+ if (xlate_proc_loc_name(name, parent, &fn) != 0) - goto out; - - /* At this point there must not be any '/' characters beyond *fn */ - if (strchr(fn, '/')) -- goto out; -+ goto out_put; - - len = strlen(fn); - - ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); -- if (!ent) goto out; -+ if (!ent) -+ goto out_put; - - memset(ent, 0, sizeof(struct proc_dir_entry)); - memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); -@@ -581,8 +804,13 @@ static struct proc_dir_entry *proc_creat - ent->namelen = len; - ent->mode = mode; - ent->nlink = nlink; -- out: -+ atomic_set(&ent->count, 1); - 
return ent; -+ -+out_put: -+ de_put(*parent); -+out: -+ return NULL; - } - - struct proc_dir_entry *proc_symlink(const char *name, -@@ -606,6 +834,7 @@ struct proc_dir_entry *proc_symlink(cons - kfree(ent); - ent = NULL; - } -+ de_put(parent); - } - return ent; - } -@@ -624,6 +853,7 @@ struct proc_dir_entry *proc_mkdir_mode(c - kfree(ent); - ent = NULL; - } -+ de_put(parent); - } - return ent; - } -@@ -662,9 +892,28 @@ struct proc_dir_entry *create_proc_entry - kfree(ent); - ent = NULL; - } -+ de_put(parent); - } - return ent; - } -+EXPORT_SYMBOL(remove_proc_glob_entry); -+ -+struct proc_dir_entry *create_proc_glob_entry(const char *name, mode_t mode, -+ struct proc_dir_entry *parent) -+{ -+ const char *path; -+ struct proc_dir_entry *ent; -+ -+ path = name; -+ if (xlate_proc_name(path, &parent, &name) != 0) -+ return NULL; -+ -+ ent = create_proc_entry(name, mode, parent); -+ de_put(parent); -+ return ent; -+} -+ -+EXPORT_SYMBOL(create_proc_glob_entry); - - void free_proc_entry(struct proc_dir_entry *de) - { -@@ -684,20 +933,21 @@ void free_proc_entry(struct proc_dir_ent - * Remove a /proc entry and free it if it's not currently in use. - * If it is in use, we set the 'deleted' flag. 
- */ --void remove_proc_entry(const char *name, struct proc_dir_entry *parent) -+static void __remove_proc_entry(const char *name, struct proc_dir_entry *parent) - { - struct proc_dir_entry **p; - struct proc_dir_entry *de; - const char *fn = name; - int len; - -- if (!parent && xlate_proc_name(name, &parent, &fn) != 0) -- goto out; - len = strlen(fn); -+ write_lock(&proc_tree_lock); - for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (!proc_match(len, fn, *p)) - continue; -+ - de = *p; -+ notify_readers(de); - *p = de->next; - de->next = NULL; - if (S_ISDIR(de->mode)) -@@ -705,15 +955,43 @@ void remove_proc_entry(const char *name, - proc_kill_inodes(de); - de->nlink = 0; - WARN_ON(de->subdir); -- if (!atomic_read(&de->count)) -- free_proc_entry(de); -- else { -- de->deleted = 1; -- printk("remove_proc_entry: %s/%s busy, count=%d\n", -- parent->name, de->name, atomic_read(&de->count)); -- } -+ de->deleted = 1; -+ de_put(de); -+ de_put(parent); - break; - } --out: -- return; -+ write_unlock(&proc_tree_lock); -+} -+ -+void remove_proc_loc_entry(const char *name, struct proc_dir_entry *parent) -+{ -+ const char *path; -+ -+ path = name; -+ if (xlate_proc_loc_name(path, &parent, &name) != 0) -+ return; -+ -+ __remove_proc_entry(name, parent); -+ de_put(parent); -+} -+ -+void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent) -+{ -+ const char *path; -+ -+ path = name; -+ if (xlate_proc_name(path, &parent, &name) != 0) -+ return; -+ -+ __remove_proc_entry(name, parent); -+ de_put(parent); -+} -+ -+void remove_proc_entry(const char *name, struct proc_dir_entry *parent) -+{ -+ remove_proc_loc_entry(name, parent); -+#ifdef CONFIG_VE -+ if (ve_is_super(get_exec_env())) -+ remove_proc_glob_entry(name, parent); -+#endif - } -diff -upr linux-2.6.16.orig/fs/proc/inode.c linux-2.6.16-026test009/fs/proc/inode.c ---- linux-2.6.16.orig/fs/proc/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/inode.c 2006-04-19 
15:02:12.000000000 +0400 -@@ -8,6 +8,7 @@ - #include <linux/proc_fs.h> - #include <linux/kernel.h> - #include <linux/mm.h> -+#include <linux/ve_owner.h> - #include <linux/string.h> - #include <linux/stat.h> - #include <linux/file.h> -@@ -21,34 +22,25 @@ - - #include "internal.h" - --static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) --{ -- if (de) -- atomic_inc(&de->count); -- return de; --} -- - /* - * Decrements the use count and checks for deferred deletion. - */ --static void de_put(struct proc_dir_entry *de) -+void de_put(struct proc_dir_entry *de) - { - if (de) { -- lock_kernel(); - if (!atomic_read(&de->count)) { - printk("de_put: entry %s already free!\n", de->name); -- unlock_kernel(); - return; - } - - if (atomic_dec_and_test(&de->count)) { -- if (de->deleted) { -- printk("de_put: deferred delete of %s\n", -+ if (unlikely(!de->deleted)) { -+ printk("de_put: early delete of %s\n", - de->name); -- free_proc_entry(de); -+ return; - } -+ free_proc_entry(de); - } -- unlock_kernel(); - } - } - -@@ -68,12 +60,19 @@ static void proc_delete_inode(struct ino - put_task_struct(tsk); - - /* Let go of any associated proc directory entry */ -- de = PROC_I(inode)->pde; -+ de = LPDE(inode); - if (de) { - if (de->owner) - module_put(de->owner); - de_put(de); - } -+#ifdef CONFIG_VE -+ de = GPDE(inode); -+ if (de) { -+ module_put(de->owner); -+ de_put(de); -+ } -+#endif - clear_inode(inode); - } - -@@ -100,6 +99,9 @@ static struct inode *proc_alloc_inode(st - ei->pde = NULL; - inode = &ei->vfs_inode; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; -+#ifdef CONFIG_VE -+ GPDE(inode) = NULL; -+#endif - return inode; - } - -@@ -209,6 +211,12 @@ int proc_fill_super(struct super_block * - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) - goto out_no_root; -+#ifdef CONFIG_VE -+ LPDE(root_inode) = de_get(get_exec_env()->proc_root); -+ GPDE(root_inode) = &proc_root; -+#else -+ LPDE(root_inode) = &proc_root; -+#endif - return 0; - - 
out_no_root: -diff -upr linux-2.6.16.orig/fs/proc/kmsg.c linux-2.6.16-026test009/fs/proc/kmsg.c ---- linux-2.6.16.orig/fs/proc/kmsg.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/kmsg.c 2006-04-19 15:02:12.000000000 +0400 -@@ -11,6 +11,7 @@ - #include <linux/kernel.h> - #include <linux/poll.h> - #include <linux/fs.h> -+#include <linux/veprintk.h> - - #include <asm/uaccess.h> - #include <asm/io.h> -@@ -40,7 +41,7 @@ static ssize_t kmsg_read(struct file *fi - - static unsigned int kmsg_poll(struct file *file, poll_table *wait) - { -- poll_wait(file, &log_wait, wait); -+ poll_wait(file, &ve_log_wait, wait); - if (do_syslog(9, NULL, 0)) - return POLLIN | POLLRDNORM; - return 0; -diff -upr linux-2.6.16.orig/fs/proc/proc_misc.c linux-2.6.16-026test009/fs/proc/proc_misc.c ---- linux-2.6.16.orig/fs/proc/proc_misc.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/proc_misc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -32,6 +32,7 @@ - #include <linux/pagemap.h> - #include <linux/swap.h> - #include <linux/slab.h> -+#include <linux/virtinfo.h> - #include <linux/smp.h> - #include <linux/signal.h> - #include <linux/module.h> -@@ -45,6 +46,8 @@ - #include <linux/jiffies.h> - #include <linux/sysrq.h> - #include <linux/vmalloc.h> -+#include <linux/version.h> -+#include <linux/compile.h> - #include <linux/crash_dump.h> - #include <asm/uaccess.h> - #include <asm/pgtable.h> -@@ -53,8 +56,10 @@ - #include <asm/div64.h> - #include "internal.h" - --#define LOAD_INT(x) ((x) >> FSHIFT) --#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) -+#ifdef CONFIG_FAIRSCHED -+#include <linux/fairsched.h> -+#endif -+ - /* - * Warning: stuff below (imported functions) assumes that its output will fit - * into one page. For some of those functions it may be wrong. 
Moreover, we -@@ -84,15 +89,33 @@ static int loadavg_read_proc(char *page, - { - int a, b, c; - int len; -- -- a = avenrun[0] + (FIXED_1/200); -- b = avenrun[1] + (FIXED_1/200); -- c = avenrun[2] + (FIXED_1/200); -+ unsigned long __nr_running; -+ int __nr_threads; -+ unsigned long *__avenrun; -+ struct ve_struct *ve; -+ -+ ve = get_exec_env(); -+ -+ if (ve_is_super(ve)) { -+ __avenrun = &avenrun[0]; -+ __nr_running = nr_running(); -+ __nr_threads = nr_threads; -+ } -+#ifdef CONFIG_VE -+ else { -+ __avenrun = &ve->avenrun[0]; -+ __nr_running = nr_running_ve(ve); -+ __nr_threads = atomic_read(&ve->pcounter); -+ } -+#endif -+ a = __avenrun[0] + (FIXED_1/200); -+ b = __avenrun[1] + (FIXED_1/200); -+ c = __avenrun[2] + (FIXED_1/200); - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", - LOAD_INT(a), LOAD_FRAC(a), - LOAD_INT(b), LOAD_FRAC(b), - LOAD_INT(c), LOAD_FRAC(c), -- nr_running(), nr_threads, last_pid); -+ __nr_running, __nr_threads, last_pid); - return proc_calc_metrics(page, start, off, count, eof, len); - } - -@@ -105,6 +128,13 @@ static int uptime_read_proc(char *page, - cputime_t idletime = cputime_add(init_task.utime, init_task.stime); - - do_posix_clock_monotonic_gettime(&uptime); -+#ifdef CONFIG_VE -+ if (!ve_is_super(get_exec_env())) { -+ set_normalized_timespec(&uptime, -+ uptime.tv_sec - get_exec_env()->start_timespec.tv_sec, -+ uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec); -+ } -+#endif - cputime_to_timespec(idletime, &idle); - len = sprintf(page,"%lu.%02lu %lu.%02lu\n", - (unsigned long) uptime.tv_sec, -@@ -118,35 +148,37 @@ static int uptime_read_proc(char *page, - static int meminfo_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) - { -- struct sysinfo i; -+ struct meminfo mi; - int len; -- struct page_state ps; -- unsigned long inactive; -- unsigned long active; -- unsigned long free; -- unsigned long committed; -- unsigned long allowed; -+ unsigned long dummy; - struct vmalloc_info vmi; -- long 
cached; - -- get_page_state(&ps); -- get_zone_counts(&active, &inactive, &free); -+ get_page_state(&mi.ps); -+ get_zone_counts(&mi.active, &mi.inactive, &dummy); - - /* - * display in kilobytes. - */ - #define K(x) ((x) << (PAGE_SHIFT - 10)) -- si_meminfo(&i); -- si_swapinfo(&i); -- committed = atomic_read(&vm_committed_space); -- allowed = ((totalram_pages - hugetlb_total_pages()) -- * sysctl_overcommit_ratio / 100) + total_swap_pages; -+ si_meminfo(&mi.si); -+ si_swapinfo(&mi.si); -+ mi.committed_space = atomic_read(&vm_committed_space); -+ mi.swapcache = total_swapcache_pages; -+ mi.cache = get_page_cache_size() - mi.swapcache - mi.si.bufferram; -+ if (mi.cache < 0) -+ mi.cache = 0; - -- cached = get_page_cache_size() - total_swapcache_pages - i.bufferram; -- if (cached < 0) -- cached = 0; -+ mi.vmalloc_total = (VMALLOC_END - VMALLOC_START) >> PAGE_SHIFT; -+ mi.allowed = ((totalram_pages - hugetlb_total_pages()) -+ * sysctl_overcommit_ratio / 100) + total_swap_pages; - - get_vmalloc_info(&vmi); -+ mi.vmalloc_used = vmi.used >> PAGE_SHIFT; -+ mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT; -+ -+ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) -+ & NOTIFY_FAIL) -+ return -ENOMSG; - - /* - * Tagged format, for easy grepping and expansion. 
-@@ -175,29 +207,29 @@ static int meminfo_read_proc(char *page, - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", -- K(i.totalram), -- K(i.freeram), -- K(i.bufferram), -- K(cached), -- K(total_swapcache_pages), -- K(active), -- K(inactive), -- K(i.totalhigh), -- K(i.freehigh), -- K(i.totalram-i.totalhigh), -- K(i.freeram-i.freehigh), -- K(i.totalswap), -- K(i.freeswap), -- K(ps.nr_dirty), -- K(ps.nr_writeback), -- K(ps.nr_mapped), -- K(ps.nr_slab), -- K(allowed), -- K(committed), -- K(ps.nr_page_table_pages), -- (unsigned long)VMALLOC_TOTAL >> 10, -- vmi.used >> 10, -- vmi.largest_chunk >> 10 -+ K(mi.si.totalram), -+ K(mi.si.freeram), -+ K(mi.si.bufferram), -+ K(mi.cache), -+ K(mi.swapcache), -+ K(mi.active), -+ K(mi.inactive), -+ K(mi.si.totalhigh), -+ K(mi.si.freehigh), -+ K(mi.si.totalram-mi.si.totalhigh), -+ K(mi.si.freeram-mi.si.freehigh), -+ K(mi.si.totalswap), -+ K(mi.si.freeswap), -+ K(mi.ps.nr_dirty), -+ K(mi.ps.nr_writeback), -+ K(mi.ps.nr_mapped), -+ K(mi.ps.nr_slab), -+ K(mi.allowed), -+ K(mi.committed_space), -+ K(mi.ps.nr_page_table_pages), -+ K(mi.vmalloc_total), -+ K(mi.vmalloc_used), -+ K(mi.vmalloc_largest) - ); - - len += hugetlb_report_meminfo(page + len); -@@ -237,8 +269,15 @@ static int version_read_proc(char *page, - int count, int *eof, void *data) - { - int len; -+ struct new_utsname *utsname = &ve_utsname; - -- strcpy(page, linux_banner); -+ if (ve_is_super(get_exec_env())) -+ strcpy(page, linux_banner); -+ else -+ sprintf(page, "Linux version %s (" -+ LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" -+ LINUX_COMPILER ") %s\n", -+ utsname->release, utsname->version); - len = strlen(page); - return proc_calc_metrics(page, start, off, count, eof, len); - } -@@ -312,7 +351,7 @@ static void *devinfo_next(struct seq_fil - case BLK_HDR: - info->state = BLK_LIST; - (*pos)++; -- break; -+ /*fallthrough*/ - case BLK_LIST: - if (get_blkdev_info(info->blkdev,&idummy,&ndummy)) { - /* -@@ -487,18 +526,15 @@ static struct 
file_operations proc_slabi - }; - #endif - --static int show_stat(struct seq_file *p, void *v) -+static void show_stat_ve0(struct seq_file *p) - { - int i; -- unsigned long jif; -+ struct page_state page_state; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - u64 sum = 0; - - user = nice = system = idle = iowait = - irq = softirq = steal = cputime64_zero; -- jif = - wall_to_monotonic.tv_sec; -- if (wall_to_monotonic.tv_nsec) -- --jif; - - for_each_cpu(i) { - int j; -@@ -552,9 +588,84 @@ static int show_stat(struct seq_file *p, - for (i = 0; i < NR_IRQS; i++) - seq_printf(p, " %u", kstat_irqs(i)); - #endif -+ get_full_page_state(&page_state); -+ seq_printf(p, "\nswap %lu %lu\n", page_state.pswpin, page_state.pswpout); -+} -+ -+#ifdef CONFIG_VE -+static void show_stat_ve(struct seq_file *p, struct ve_struct *env) -+{ -+ int i; -+ u64 user, nice, system; -+ cycles_t idle, iowait; -+ cpumask_t ve_cpus; -+ -+ ve_cpu_online_map(env, &ve_cpus); -+ -+ user = nice = system = idle = iowait = 0; -+ for_each_cpu_mask(i, ve_cpus) { -+ user += VE_CPU_STATS(env, i)->user; -+ nice += VE_CPU_STATS(env, i)->nice; -+ system += VE_CPU_STATS(env, i)->system; -+ idle += ve_sched_get_idle_time(env, i); -+ iowait += ve_sched_get_iowait_time(env, i); -+ } -+ -+ seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n", -+ (unsigned long long)cputime64_to_clock_t(user), -+ (unsigned long long)cputime64_to_clock_t(nice), -+ (unsigned long long)cputime64_to_clock_t(system), -+ (unsigned long long)cycles_to_clocks(idle), -+ (unsigned long long)cycles_to_clocks(iowait)); -+ -+ for_each_cpu_mask(i, ve_cpus) { -+ user = VE_CPU_STATS(env, i)->user; -+ nice = VE_CPU_STATS(env, i)->nice; -+ system = VE_CPU_STATS(env, i)->system; -+ idle = ve_sched_get_idle_time(env, i); -+ iowait = ve_sched_get_iowait_time(env, i); -+ seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n", -+ i, -+ (unsigned long long)cputime64_to_clock_t(user), -+ (unsigned long long)cputime64_to_clock_t(nice), -+ 
(unsigned long long)cputime64_to_clock_t(system), -+ (unsigned long long)cycles_to_clocks(idle), -+ (unsigned long long)cycles_to_clocks(iowait)); -+ } -+ seq_printf(p, "intr 0\nswap 0 0\n"); -+} -+#endif -+ -+int show_stat(struct seq_file *p, void *v) -+{ -+ extern unsigned long total_forks; -+ unsigned long seq, jif; -+ struct ve_struct *env; -+ unsigned long __nr_running, __nr_iowait; -+ -+ do { -+ seq = read_seqbegin(&xtime_lock); -+ jif = - wall_to_monotonic.tv_sec; -+ if (wall_to_monotonic.tv_nsec) -+ --jif; -+ } while (read_seqretry(&xtime_lock, seq)); -+ -+ env = get_exec_env(); -+ if (ve_is_super(env)) { -+ show_stat_ve0(p); -+ __nr_running = nr_running(); -+ __nr_iowait = nr_iowait(); -+ } -+#ifdef CONFIG_VE -+ else { -+ show_stat_ve(p, env); -+ __nr_running = nr_running_ve(env); -+ __nr_iowait = nr_iowait_ve(env); -+ } -+#endif - - seq_printf(p, -- "\nctxt %llu\n" -+ "ctxt %llu\n" - "btime %lu\n" - "processes %lu\n" - "procs_running %lu\n" -@@ -562,8 +673,8 @@ static int show_stat(struct seq_file *p, - nr_context_switches(), - (unsigned long)jif, - total_forks, -- nr_running(), -- nr_iowait()); -+ __nr_running, -+ __nr_iowait); - - return 0; - } -@@ -652,7 +763,8 @@ static int cmdline_read_proc(char *page, - { - int len; - -- len = sprintf(page, "%s\n", saved_command_line); -+ len = sprintf(page, "%s\n", -+ ve_is_super(get_exec_env()) ? 
saved_command_line : ""); - return proc_calc_metrics(page, start, off, count, eof, len); - } - -diff -upr linux-2.6.16.orig/fs/proc/proc_tty.c linux-2.6.16-026test009/fs/proc/proc_tty.c ---- linux-2.6.16.orig/fs/proc/proc_tty.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/proc_tty.c 2006-04-19 15:02:12.000000000 +0400 -@@ -6,6 +6,7 @@ - - #include <asm/uaccess.h> - -+#include <linux/ve_owner.h> - #include <linux/init.h> - #include <linux/errno.h> - #include <linux/time.h> -@@ -106,24 +107,35 @@ static int show_tty_driver(struct seq_fi - /* iterator */ - static void *t_start(struct seq_file *m, loff_t *pos) - { -- struct list_head *p; -+ struct tty_driver *drv; -+ - loff_t l = *pos; -- list_for_each(p, &tty_drivers) -+ read_lock(&tty_driver_guard); -+ list_for_each_entry(drv, &tty_drivers, tty_drivers) { -+ if (!ve_accessible_strict(VE_OWNER_TTYDRV(drv), get_exec_env())) -+ continue; - if (!l--) -- return list_entry(p, struct tty_driver, tty_drivers); -+ return drv; -+ } - return NULL; - } - - static void *t_next(struct seq_file *m, void *v, loff_t *pos) - { -- struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next; -+ struct tty_driver *drv; -+ - (*pos)++; -- return p==&tty_drivers ? 
NULL : -- list_entry(p, struct tty_driver, tty_drivers); -+ drv = (struct tty_driver *)v; -+ list_for_each_entry_continue(drv, &tty_drivers, tty_drivers) { -+ if (ve_accessible_strict(VE_OWNER_TTYDRV(drv), get_exec_env())) -+ return drv; -+ } -+ return NULL; - } - - static void t_stop(struct seq_file *m, void *v) - { -+ read_unlock(&tty_driver_guard); - } - - static struct seq_operations tty_drivers_op = { -diff -upr linux-2.6.16.orig/fs/proc/root.c linux-2.6.16-026test009/fs/proc/root.c ---- linux-2.6.16.orig/fs/proc/root.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/root.c 2006-04-19 15:02:12.000000000 +0400 -@@ -20,7 +20,10 @@ - - #include "internal.h" - --struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; -+#ifndef CONFIG_VE -+struct proc_dir_entry *proc_net, *proc_net_stat; -+#endif -+struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver; - - #ifdef CONFIG_SYSCTL - struct proc_dir_entry *proc_sys_root; -@@ -32,12 +35,14 @@ static struct super_block *proc_get_sb(s - return get_sb_single(fs_type, flags, data, proc_fill_super); - } - --static struct file_system_type proc_fs_type = { -+struct file_system_type proc_fs_type = { - .name = "proc", - .get_sb = proc_get_sb, - .kill_sb = kill_anon_super, - }; - -+EXPORT_SYMBOL(proc_fs_type); -+ - void __init proc_root_init(void) - { - int err = proc_init_inodecache(); -@@ -157,7 +162,9 @@ EXPORT_SYMBOL(create_proc_entry); - EXPORT_SYMBOL(remove_proc_entry); - EXPORT_SYMBOL(proc_root); - EXPORT_SYMBOL(proc_root_fs); -+#ifndef CONFIG_VE - EXPORT_SYMBOL(proc_net); - EXPORT_SYMBOL(proc_net_stat); -+#endif - EXPORT_SYMBOL(proc_bus); - EXPORT_SYMBOL(proc_root_driver); -diff -upr linux-2.6.16.orig/fs/proc/task_mmu.c linux-2.6.16-026test009/fs/proc/task_mmu.c ---- linux-2.6.16.orig/fs/proc/task_mmu.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/task_mmu.c 2006-04-19 15:02:12.000000000 +0400 -@@ -90,9 +90,12 @@ int 
proc_exe_link(struct inode *inode, s - } - - if (vma) { -- *mnt = mntget(vma->vm_file->f_vfsmnt); -- *dentry = dget(vma->vm_file->f_dentry); -- result = 0; -+ result = d_root_check(vma->vm_file->f_dentry, -+ vma->vm_file->f_vfsmnt); -+ if (!result) { -+ *mnt = mntget(vma->vm_file->f_vfsmnt); -+ *dentry = dget(vma->vm_file->f_dentry); -+ } - } - - up_read(&mm->mmap_sem); -diff -upr linux-2.6.16.orig/fs/proc/task_nommu.c linux-2.6.16-026test009/fs/proc/task_nommu.c ---- linux-2.6.16.orig/fs/proc/task_nommu.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/task_nommu.c 2006-04-19 15:02:12.000000000 +0400 -@@ -126,9 +126,12 @@ int proc_exe_link(struct inode *inode, s - } - - if (vma) { -- *mnt = mntget(vma->vm_file->f_vfsmnt); -- *dentry = dget(vma->vm_file->f_dentry); -- result = 0; -+ result = d_root_check(vma->vm_file->f_dentry, -+ vma->vm_file->f_vfsmnt); -+ if (!result) { -+ *mnt = mntget(vma->vm_file->f_vfsmnt); -+ *dentry = dget(vma->vm_file->f_dentry); -+ } - } - - up_read(&mm->mmap_sem); -diff -upr linux-2.6.16.orig/fs/proc/vmcore.c linux-2.6.16-026test009/fs/proc/vmcore.c ---- linux-2.6.16.orig/fs/proc/vmcore.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/proc/vmcore.c 2006-04-19 15:02:11.000000000 +0400 -@@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file * - size_t buflen, loff_t *fpos) - { - ssize_t acc = 0, tmp; -- size_t tsz, nr_bytes; -- u64 start; -+ size_t tsz; -+ u64 start, nr_bytes; - struct vmcore *curr_m = NULL; - - if (buflen == 0 || *fpos >= vmcore_size) -diff -upr linux-2.6.16.orig/fs/quota.c linux-2.6.16-026test009/fs/quota.c ---- linux-2.6.16.orig/fs/quota.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/quota.c 2006-04-19 15:02:12.000000000 +0400 -@@ -81,11 +81,11 @@ static int generic_quotactl_valid(struct - if (cmd == Q_GETQUOTA) { - if (((type == USRQUOTA && current->euid != id) || - (type == GRPQUOTA && !in_egroup_p(id))) && -- !capable(CAP_SYS_ADMIN)) -+ 
!capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - } - else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) -- if (!capable(CAP_SYS_ADMIN)) -+ if (!capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - - return 0; -@@ -132,10 +132,10 @@ static int xqm_quotactl_valid(struct sup - if (cmd == Q_XGETQUOTA) { - if (((type == XQM_USRQUOTA && current->euid != id) || - (type == XQM_GRPQUOTA && !in_egroup_p(id))) && -- !capable(CAP_SYS_ADMIN)) -+ !capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { -- if (!capable(CAP_SYS_ADMIN)) -+ if (!capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - } - -@@ -216,7 +216,7 @@ restart: - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); -- if (sb->s_root && sb->s_qcop->quota_sync) -+ if (sb->s_root && sb->s_qcop && sb->s_qcop->quota_sync) - quota_sync_sb(sb, type); - up_read(&sb->s_umount); - spin_lock(&sb_lock); -@@ -358,7 +358,7 @@ asmlinkage long sys_quotactl(unsigned in - tmp = getname(special); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); -- bdev = lookup_bdev(tmp); -+ bdev = lookup_bdev(tmp, FMODE_QUOTACTL); - putname(tmp); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); -diff -upr linux-2.6.16.orig/fs/reiserfs/namei.c linux-2.6.16-026test009/fs/reiserfs/namei.c ---- linux-2.6.16.orig/fs/reiserfs/namei.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/reiserfs/namei.c 2006-04-19 15:02:12.000000000 +0400 -@@ -864,6 +864,9 @@ static int reiserfs_rmdir(struct inode * - INITIALIZE_PATH(path); - struct reiserfs_dir_entry de; - -+ inode = dentry->d_inode; -+ DQUOT_INIT(inode); -+ - /* we will be doing 2 balancings and update 2 stat data, we change quotas - * of the owner of the directory and of the owner of the parent directory. 
- * The quota structure is possibly deleted only on last iput => outside -@@ -888,8 +891,6 @@ static int reiserfs_rmdir(struct inode * - goto end_rmdir; - } - -- inode = dentry->d_inode; -- - reiserfs_update_inode_transaction(inode); - reiserfs_update_inode_transaction(dir); - -@@ -952,6 +953,7 @@ static int reiserfs_unlink(struct inode - unsigned long savelink; - - inode = dentry->d_inode; -+ DQUOT_INIT(inode); - - /* in this transaction we can be doing at max two balancings and update - * two stat datas, we change quotas of the owner of the directory and of -@@ -1259,6 +1261,8 @@ static int reiserfs_rename(struct inode - - old_inode = old_dentry->d_inode; - new_dentry_inode = new_dentry->d_inode; -+ if (new_dentry_inode) -+ DQUOT_INIT(new_dentry_inode); - - // make sure, that oldname still exists and points to an object we - // are going to rename -diff -upr linux-2.6.16.orig/fs/reiserfs/xattr.c linux-2.6.16-026test009/fs/reiserfs/xattr.c ---- linux-2.6.16.orig/fs/reiserfs/xattr.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/reiserfs/xattr.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1343,7 +1343,8 @@ static int reiserfs_check_acl(struct ino - return error; - } - --int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) -+int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - /* - * We don't do permission checks on the internal objects. -@@ -1356,7 +1357,7 @@ int reiserfs_permission(struct inode *in - * Stat data v1 doesn't support ACLs. 
- */ - if (get_inode_sd_version(inode) == STAT_DATA_V1) -- return generic_permission(inode, mask, NULL); -+ return generic_permission(inode, mask, NULL, perm); - else -- return generic_permission(inode, mask, reiserfs_check_acl); -+ return generic_permission(inode, mask, reiserfs_check_acl, perm); - } -diff -upr linux-2.6.16.orig/fs/select.c linux-2.6.16-026test009/fs/select.c ---- linux-2.6.16.orig/fs/select.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/select.c 2006-04-19 15:02:11.000000000 +0400 -@@ -24,6 +24,8 @@ - #include <linux/fs.h> - #include <linux/rcupdate.h> - -+#include <ub/ub_mem.h> -+ - #include <asm/uaccess.h> - - #define ROUND_UP(x,y) (((x)+(y)-1)/(y)) -@@ -286,7 +288,7 @@ int do_select(int n, fd_set_bits *fds, s - - static void *select_bits_alloc(int size) - { -- return kmalloc(6 * size, GFP_KERNEL); -+ return ub_kmalloc(6 * size, GFP_KERNEL); - } - - static void select_bits_free(void *bits, int size) -@@ -645,7 +647,7 @@ int do_sys_poll(struct pollfd __user *uf - err = -ENOMEM; - while(i!=0) { - struct poll_list *pp; -- pp = kmalloc(sizeof(struct poll_list)+ -+ pp = ub_kmalloc(sizeof(struct poll_list)+ - sizeof(struct pollfd)* - (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i), - GFP_KERNEL); -diff -upr linux-2.6.16.orig/fs/seq_file.c linux-2.6.16-026test009/fs/seq_file.c ---- linux-2.6.16.orig/fs/seq_file.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/seq_file.c 2006-04-19 15:02:12.000000000 +0400 -@@ -345,6 +345,8 @@ int seq_path(struct seq_file *m, - if (m->count < m->size) { - char *s = m->buf + m->count; - char *p = d_path(dentry, mnt, s, m->size - m->count); -+ if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG) -+ return 0; - if (!IS_ERR(p)) { - while (s <= p) { - char c = *p++; -diff -upr linux-2.6.16.orig/fs/simfs.c linux-2.6.16-026test009/fs/simfs.c ---- linux-2.6.16.orig/fs/simfs.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/fs/simfs.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 
+1,319 @@ -+/* -+ * fs/simfs.c -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/fs.h> -+#include <linux/file.h> -+#include <linux/init.h> -+#include <linux/namei.h> -+#include <linux/err.h> -+#include <linux/module.h> -+#include <linux/mount.h> -+#include <linux/vzquota.h> -+#include <linux/statfs.h> -+#include <linux/virtinfo.h> -+#include <linux/faudit.h> -+#include <linux/genhd.h> -+ -+#include <asm/unistd.h> -+#include <asm/uaccess.h> -+ -+#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb -+ -+static struct super_operations sim_super_ops; -+ -+static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry, -+ struct kstat *stat) -+{ -+ struct super_block *sb; -+ struct inode *inode; -+ -+ inode = dentry->d_inode; -+ if (!inode->i_op->getattr) { -+ generic_fillattr(inode, stat); -+ if (!stat->blksize) { -+ unsigned blocks; -+ -+ sb = inode->i_sb; -+ blocks = (stat->size + sb->s_blocksize-1) >> -+ sb->s_blocksize_bits; -+ stat->blocks = (sb->s_blocksize / 512) * blocks; -+ stat->blksize = sb->s_blocksize; -+ } -+ } else { -+ int err; -+ -+ err = inode->i_op->getattr(mnt, dentry, stat); -+ if (err) -+ return err; -+ } -+ -+ sb = mnt->mnt_sb; -+ if (sb->s_op == &sim_super_ops) -+ stat->dev = sb->s_dev; -+ return 0; -+} -+ -+static void quota_get_stat(struct super_block *sb, struct kstatfs *buf) -+{ -+ int err; -+ struct dq_stat qstat; -+ struct virt_info_quota q; -+ long free_file, adj_file; -+ s64 blk, free_blk, adj_blk; -+ int bsize_bits; -+ -+ q.super = sb; -+ q.qstat = &qstat; -+ err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q); -+ if (err != NOTIFY_OK) -+ return; -+ -+ bsize_bits = ffs(buf->f_bsize) - 1; -+ free_blk = (s64)(qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits; -+ if (free_blk < 0) -+ free_blk = 0; -+ /* -+ * In the regular case, we always set buf->f_bfree and buf->f_blocks to -+ * the 
values reported by quota. In case of real disk space shortage, -+ * we adjust the values. We want this adjustment to look as if the -+ * total disk space were reduced, not as if the usage were increased. -+ * -- SAW -+ */ -+ adj_blk = 0; -+ if (buf->f_bfree < free_blk) -+ adj_blk = free_blk - buf->f_bfree; -+ buf->f_bfree = (long)(free_blk - adj_blk); -+ -+ if (free_blk < buf->f_bavail) -+ buf->f_bavail = (long)free_blk; /* min(f_bavail, free_blk) */ -+ -+ blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk; -+ buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk; -+ -+ free_file = qstat.isoftlimit - qstat.icurrent; -+ if (free_file < 0) -+ free_file = 0; -+ if (buf->f_ffree == -1) -+ /* -+ * One filesystem uses -1 to represent the fact that it doesn't -+ * have a detached limit for inode number. -+ * May be, because -1 is a good pretendent for the maximum value -+ * of signed long type, may be, because it's just nice to have -+ * an exceptional case... Guess what that filesystem is :-) -+ * -- SAW -+ */ -+ buf->f_ffree = free_file; -+ adj_file = 0; -+ if (buf->f_ffree < free_file) -+ adj_file = free_file - buf->f_ffree; -+ buf->f_ffree = free_file - adj_file; -+ buf->f_files = qstat.isoftlimit - adj_file; -+} -+ -+static int sim_statfs(struct super_block *sb, struct statfs *buf) -+{ -+ int err; -+ struct super_block *lsb; -+ struct kstatfs statbuf; -+ -+ err = 0; -+ if (sb->s_op != &sim_super_ops) -+ goto out; -+ -+ lsb = SIMFS_GET_LOWER_FS_SB(sb); -+ -+ err = -ENOSYS; -+ if (lsb && lsb->s_op && lsb->s_op->statfs) -+ err = lsb->s_op->statfs(lsb, &statbuf); -+ if (err) -+ goto out; -+ -+ quota_get_stat(sb, &statbuf); -+ -+ buf->f_files = statbuf.f_files; -+ buf->f_ffree = statbuf.f_ffree; -+ buf->f_blocks = statbuf.f_blocks; -+ buf->f_bfree = statbuf.f_bfree; -+ buf->f_bavail = statbuf.f_bavail; -+out: -+ return err; -+} -+ -+static int sim_statfs64(struct super_block *sb, struct statfs64 *buf) -+{ -+ int err; -+ struct super_block *lsb; -+ struct kstatfs statbuf; -+ -+ 
err = 0; -+ if (sb->s_op != &sim_super_ops) -+ goto out; -+ -+ lsb = SIMFS_GET_LOWER_FS_SB(sb); -+ -+ err = -ENOSYS; -+ if (lsb && lsb->s_op && lsb->s_op->statfs) -+ err = lsb->s_op->statfs(lsb, &statbuf); -+ if (err) -+ goto out; -+ -+ quota_get_stat(sb, &statbuf); -+ -+ buf->f_files = (__u64)statbuf.f_files; -+ buf->f_ffree = (__u64)statbuf.f_ffree; -+ buf->f_blocks = (__u64)statbuf.f_blocks; -+ buf->f_bfree = (__u64)statbuf.f_bfree; -+ buf->f_bavail = (__u64)statbuf.f_bavail; -+out: -+ return err; -+} -+ -+static int sim_systemcall(struct vnotifier_block *me, unsigned long n, -+ void *d, int old_ret) -+{ -+ int err; -+ struct faudit_stat_arg *arg; -+ -+ arg = (struct faudit_stat_arg *)d; -+ switch (n) { -+ case VIRTINFO_FAUDIT_STAT: -+ err = sim_getattr(arg->mnt, arg->dentry, -+ (struct kstat *)arg->stat); -+ break; -+ case VIRTINFO_FAUDIT_STATFS: -+ err = sim_statfs(arg->mnt->mnt_sb, -+ (struct statfs *)arg->stat); -+ break; -+ case VIRTINFO_FAUDIT_STATFS64: -+ err = sim_statfs64(arg->mnt->mnt_sb, -+ (struct statfs64 *)arg->stat); -+ break; -+ default: -+ return old_ret; -+ } -+ arg->err = err; -+ return (err ? 
NOTIFY_BAD : NOTIFY_OK); -+} -+ -+static struct inode *sim_quota_root(struct super_block *sb) -+{ -+ return sb->s_root->d_inode; -+} -+ -+void sim_put_super(struct super_block *sb) -+{ -+ struct virt_info_quota viq; -+ -+ viq.super = sb; -+ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq); -+ bdput(sb->s_bdev); -+} -+ -+static struct super_operations sim_super_ops = { -+ .get_quota_root = sim_quota_root, -+ .put_super = sim_put_super, -+}; -+ -+static int sim_fill_super(struct super_block *s, void *data) -+{ -+ int err; -+ struct nameidata *nd; -+ -+ err = set_anon_super(s, NULL); -+ if (err) -+ goto out; -+ -+ err = 0; -+ nd = (struct nameidata *)data; -+ s->s_root = dget(nd->dentry); -+ s->s_op = &sim_super_ops; -+out: -+ return err; -+} -+ -+struct super_block *sim_get_sb(struct file_system_type *type, -+ int flags, const char *dev_name, void *opt) -+{ -+ int err; -+ struct nameidata nd; -+ struct super_block *sb; -+ struct block_device *bd; -+ struct virt_info_quota viq; -+ static struct hd_struct fake_hds; -+ -+ sb = ERR_PTR(-EINVAL); -+ if (opt == NULL) -+ goto out; -+ -+ err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); -+ sb = ERR_PTR(err); -+ if (err) -+ goto out; -+ -+ sb = sget(type, NULL, sim_fill_super, &nd); -+ if (IS_ERR(sb)) -+ goto out_path; -+ -+ bd = bdget(sb->s_dev); -+ if (!bd) -+ goto out_killsb; -+ -+ sb->s_bdev = bd; -+ bd->bd_part = &fake_hds; -+ viq.super = sb; -+ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq); -+out_path: -+ path_release(&nd); -+out: -+ return sb; -+ -+out_killsb: -+ up_write(&sb->s_umount); -+ deactivate_super(sb); -+ sb = ERR_PTR(-ENODEV); -+ goto out_path; -+} -+ -+static struct file_system_type sim_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "simfs", -+ .get_sb = sim_get_sb, -+ .kill_sb = kill_anon_super, -+}; -+ -+static struct vnotifier_block sim_syscalls = { -+ .notifier_call = sim_systemcall, -+}; -+ -+static int __init init_simfs(void) -+{ -+ int err; -+ -+ err = 
register_filesystem(&sim_fs_type); -+ if (err) -+ return err; -+ -+ virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls); -+ return 0; -+} -+ -+static void __exit exit_simfs(void) -+{ -+ virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls); -+ unregister_filesystem(&sim_fs_type); -+} -+ -+MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); -+MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System"); -+MODULE_LICENSE("GPL v2"); -+ -+module_init(init_simfs); -+module_exit(exit_simfs); -diff -upr linux-2.6.16.orig/fs/smbfs/file.c linux-2.6.16-026test009/fs/smbfs/file.c ---- linux-2.6.16.orig/fs/smbfs/file.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/smbfs/file.c 2006-04-19 15:02:11.000000000 +0400 -@@ -387,7 +387,8 @@ smb_file_release(struct inode *inode, st - * privileges, so we need our own check for this. - */ - static int --smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) -+smb_file_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *perm) - { - int mode = inode->i_mode; - int error = 0; -diff -upr linux-2.6.16.orig/fs/smbfs/inode.c linux-2.6.16-026test009/fs/smbfs/inode.c ---- linux-2.6.16.orig/fs/smbfs/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/smbfs/inode.c 2006-04-19 15:02:11.000000000 +0400 -@@ -233,7 +233,7 @@ smb_invalidate_inodes(struct smb_sb_info - { - VERBOSE("\n"); - shrink_dcache_sb(SB_of(server)); -- invalidate_inodes(SB_of(server)); -+ invalidate_inodes(SB_of(server), 0); - } - - /* -diff -upr linux-2.6.16.orig/fs/stat.c linux-2.6.16-026test009/fs/stat.c ---- linux-2.6.16.orig/fs/stat.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/stat.c 2006-04-19 15:02:12.000000000 +0400 -@@ -15,6 +15,7 @@ - #include <linux/namei.h> - #include <linux/security.h> - #include <linux/syscalls.h> -+#include <linux/faudit.h> - - #include <asm/uaccess.h> - #include <asm/unistd.h> -@@ -42,11 +43,19 @@ int vfs_getattr(struct 
vfsmount *mnt, st - { - struct inode *inode = dentry->d_inode; - int retval; -+ struct faudit_stat_arg arg; - - retval = security_inode_getattr(mnt, dentry); - if (retval) - return retval; - -+ arg.mnt = mnt; -+ arg.dentry = dentry; -+ arg.stat = stat; -+ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg) -+ != NOTIFY_DONE) -+ return arg.err; -+ - if (inode->i_op->getattr) - return inode->i_op->getattr(mnt, dentry, stat); - -diff -upr linux-2.6.16.orig/fs/super.c linux-2.6.16-026test009/fs/super.c ---- linux-2.6.16.orig/fs/super.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/super.c 2006-04-19 15:02:12.000000000 +0400 -@@ -23,6 +23,7 @@ - #include <linux/config.h> - #include <linux/module.h> - #include <linux/slab.h> -+#include <linux/ve_owner.h> - #include <linux/init.h> - #include <linux/smp_lock.h> - #include <linux/acct.h> -@@ -69,6 +70,7 @@ static struct super_block *alloc_super(v - INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_files); - INIT_LIST_HEAD(&s->s_instances); -+ INIT_LIST_HEAD(&s->s_dshrinkers); - INIT_HLIST_HEAD(&s->s_anon); - INIT_LIST_HEAD(&s->s_inodes); - init_rwsem(&s->s_umount); -@@ -231,13 +233,14 @@ void generic_shutdown_super(struct super - if (root) { - sb->s_root = NULL; - shrink_dcache_parent(root); -- shrink_dcache_anon(&sb->s_anon); -+ shrink_dcache_anon(sb); - dput(root); -+ dcache_shrinker_wait_sb(sb); - fsync_super(sb); - lock_super(sb); - sb->s_flags &= ~MS_ACTIVE; - /* bad name - it should be evict_inodes() */ -- invalidate_inodes(sb); -+ invalidate_inodes(sb, 0); - lock_kernel(); - - if (sop->write_super && sb->s_dirt) -@@ -246,7 +249,7 @@ void generic_shutdown_super(struct super - sop->put_super(sb); - - /* Forget any remaining inodes */ -- if (invalidate_inodes(sb)) { -+ if (invalidate_inodes(sb, 1)) { - printk("VFS: Busy inodes after unmount of %s. " - "Self-destruct in 5 seconds. 
Have a nice day...\n", - sb->s_id); -@@ -481,11 +484,20 @@ asmlinkage long sys_ustat(unsigned dev, - struct super_block *s; - struct ustat tmp; - struct kstatfs sbuf; -- int err = -EINVAL; -+ dev_t kdev; -+ int err; -+ -+ kdev = new_decode_dev(dev); -+#ifdef CONFIG_VE -+ err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ); -+ if (err) -+ goto out; -+#endif - -- s = user_get_super(new_decode_dev(dev)); -- if (s == NULL) -- goto out; -+ err = -EINVAL; -+ s = user_get_super(kdev); -+ if (s == NULL) -+ goto out; - err = vfs_statfs(s, &sbuf); - drop_super(s); - if (err) -@@ -599,6 +611,13 @@ void emergency_remount(void) - static struct idr unnamed_dev_idr; - static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ - -+/* for compatibility with coreutils still unaware of new minor sizes */ -+int unnamed_dev_majors[] = { -+ 0, 144, 145, 146, 242, 243, 244, 245, -+ 246, 247, 248, 249, 250, 251, 252, 253 -+}; -+EXPORT_SYMBOL(unnamed_dev_majors); -+ - int set_anon_super(struct super_block *s, void *data) - { - int dev; -@@ -616,13 +635,13 @@ int set_anon_super(struct super_block *s - else if (error) - return -EAGAIN; - -- if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { -+ if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) { - spin_lock(&unnamed_dev_lock); - idr_remove(&unnamed_dev_idr, dev); - spin_unlock(&unnamed_dev_lock); - return -EMFILE; - } -- s->s_dev = MKDEV(0, dev & MINORMASK); -+ s->s_dev = make_unnamed_dev(dev); - return 0; - } - -@@ -630,8 +649,9 @@ EXPORT_SYMBOL(set_anon_super); - - void kill_anon_super(struct super_block *sb) - { -- int slot = MINOR(sb->s_dev); -+ int slot; - -+ slot = unnamed_dev_idx(sb->s_dev); - generic_shutdown_super(sb); - spin_lock(&unnamed_dev_lock); - idr_remove(&unnamed_dev_idr, slot); -diff -upr linux-2.6.16.orig/fs/sysfs/bin.c linux-2.6.16-026test009/fs/sysfs/bin.c ---- linux-2.6.16.orig/fs/sysfs/bin.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/sysfs/bin.c 2006-04-19 15:02:12.000000000 +0400 -@@ -120,6 
+120,9 @@ static int open(struct inode * inode, st - struct bin_attribute * attr = to_bin_attr(file->f_dentry); - int error = -EINVAL; - -+ if (!ve_sysfs_alowed()) -+ return 0; -+ - if (!kobj || !attr) - goto Done; - -@@ -196,6 +199,9 @@ int sysfs_create_bin_file(struct kobject - - int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) - { -+ if (!ve_sysfs_alowed()) -+ return 0; -+ - sysfs_hash_and_remove(kobj->dentry,attr->attr.name); - return 0; - } -diff -upr linux-2.6.16.orig/fs/sysfs/dir.c linux-2.6.16-026test009/fs/sysfs/dir.c ---- linux-2.6.16.orig/fs/sysfs/dir.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/sysfs/dir.c 2006-04-19 15:02:12.000000000 +0400 -@@ -144,6 +144,9 @@ int sysfs_create_dir(struct kobject * ko - struct dentry * parent; - int error = 0; - -+ if (!ve_sysfs_alowed()) -+ return 0; -+ - BUG_ON(!kobj); - - if (kobj->parent) -@@ -278,10 +281,14 @@ void sysfs_remove_subdir(struct dentry * - - void sysfs_remove_dir(struct kobject * kobj) - { -- struct dentry * dentry = dget(kobj->dentry); -+ struct dentry * dentry; - struct sysfs_dirent * parent_sd; - struct sysfs_dirent * sd, * tmp; - -+ if (!ve_sysfs_alowed()) -+ return; -+ -+ dentry = dget(kobj->dentry); - if (!dentry) - return; - -@@ -302,6 +309,7 @@ void sysfs_remove_dir(struct kobject * k - * Drop reference from dget() on entrance. 
- */ - dput(dentry); -+ kobj->dentry = NULL; - } - - int sysfs_rename_dir(struct kobject * kobj, const char *new_name) -@@ -309,6 +317,9 @@ int sysfs_rename_dir(struct kobject * ko - int error = 0; - struct dentry * new_dentry, * parent; - -+ if (!ve_sysfs_alowed()) -+ return 0; -+ - if (!strcmp(kobject_name(kobj), new_name)) - return -EINVAL; - -diff -upr linux-2.6.16.orig/fs/sysfs/file.c linux-2.6.16-026test009/fs/sysfs/file.c ---- linux-2.6.16.orig/fs/sysfs/file.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/sysfs/file.c 2006-04-19 15:02:12.000000000 +0400 -@@ -183,7 +183,7 @@ fill_write_buffer(struct sysfs_buffer * - return -ENOMEM; - - if (count >= PAGE_SIZE) -- count = PAGE_SIZE; -+ count = PAGE_SIZE - 1; - error = copy_from_user(buffer->page,buf,count); - buffer->needs_read_fill = 1; - return error ? -EFAULT : count; -@@ -380,6 +380,9 @@ int sysfs_add_file(struct dentry * dir, - - int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) - { -+ if (!ve_sysfs_alowed()) -+ return 0; -+ - BUG_ON(!kobj || !kobj->dentry || !attr); - - return sysfs_add_file(kobj->dentry, attr, SYSFS_KOBJ_ATTR); -@@ -398,6 +401,9 @@ int sysfs_update_file(struct kobject * k - struct dentry * victim; - int res = -ENOENT; - -+ if (!ve_sysfs_alowed()) -+ return 0; -+ - mutex_lock(&dir->d_inode->i_mutex); - victim = lookup_one_len(attr->name, dir, strlen(attr->name)); - if (!IS_ERR(victim)) { -@@ -473,6 +479,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); - - void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) - { -+ if (!ve_sysfs_alowed()) -+ return; -+ - sysfs_hash_and_remove(kobj->dentry,attr->name); - } - -diff -upr linux-2.6.16.orig/fs/sysfs/group.c linux-2.6.16-026test009/fs/sysfs/group.c ---- linux-2.6.16.orig/fs/sysfs/group.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/sysfs/group.c 2006-04-19 15:02:12.000000000 +0400 -@@ -46,6 +46,9 @@ int sysfs_create_group(struct kobject * - struct dentry * 
dir; - int error; - -+ if (!ve_sysfs_alowed()) -+ return 0; -+ - BUG_ON(!kobj || !kobj->dentry); - - if (grp->name) { -@@ -68,6 +71,9 @@ void sysfs_remove_group(struct kobject * - { - struct dentry * dir; - -+ if (!ve_sysfs_alowed()) -+ return; -+ - if (grp->name) - dir = lookup_one_len(grp->name, kobj->dentry, - strlen(grp->name)); -diff -upr linux-2.6.16.orig/fs/sysfs/inode.c linux-2.6.16-026test009/fs/sysfs/inode.c ---- linux-2.6.16.orig/fs/sysfs/inode.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/sysfs/inode.c 2006-04-19 15:02:12.000000000 +0400 -@@ -8,14 +8,13 @@ - - #undef DEBUG - -+#include <linux/config.h> - #include <linux/pagemap.h> - #include <linux/namei.h> - #include <linux/backing-dev.h> - #include <linux/capability.h> - #include "sysfs.h" - --extern struct super_block * sysfs_sb; -- - static struct address_space_operations sysfs_aops = { - .readpage = simple_readpage, - .prepare_write = simple_prepare_write, -@@ -227,12 +226,16 @@ void sysfs_drop_dentry(struct sysfs_dire - void sysfs_hash_and_remove(struct dentry * dir, const char * name) - { - struct sysfs_dirent * sd; -- struct sysfs_dirent * parent_sd = dir->d_fsdata; -+ struct sysfs_dirent * parent_sd; -+ -+ if (!dir) -+ return; - - if (dir->d_inode == NULL) - /* no inode means this hasn't been made visible yet */ - return; - -+ parent_sd = dir->d_fsdata; - mutex_lock(&dir->d_inode->i_mutex); - list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (!sd->s_element) -diff -upr linux-2.6.16.orig/fs/sysfs/mount.c linux-2.6.16-026test009/fs/sysfs/mount.c ---- linux-2.6.16.orig/fs/sysfs/mount.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/sysfs/mount.c 2006-04-19 15:02:12.000000000 +0400 -@@ -7,6 +7,7 @@ - #include <linux/fs.h> - #include <linux/mount.h> - #include <linux/pagemap.h> -+#include <linux/module.h> - #include <linux/init.h> - - #include "sysfs.h" -@@ -14,8 +15,11 @@ - /* Random magic number */ - #define SYSFS_MAGIC 0x62656572 - 
-+#ifndef CONFIG_VE - struct vfsmount *sysfs_mount; - struct super_block * sysfs_sb = NULL; -+#endif -+ - kmem_cache_t *sysfs_dir_cachep; - - static struct super_operations sysfs_ops = { -@@ -31,6 +35,15 @@ static struct sysfs_dirent sysfs_root = - .s_iattr = NULL, - }; - -+#ifdef CONFIG_VE -+static void init_ve0_sysfs_root(void) -+{ -+ get_ve0()->sysfs_root = &sysfs_root; -+} -+ -+#define sysfs_root (*(get_exec_env()->sysfs_root)) -+#endif -+ - static int sysfs_fill_super(struct super_block *sb, void *data, int silent) - { - struct inode *inode; -@@ -72,16 +85,21 @@ static struct super_block *sysfs_get_sb( - return get_sb_single(fs_type, flags, data, sysfs_fill_super); - } - --static struct file_system_type sysfs_fs_type = { -+struct file_system_type sysfs_fs_type = { - .name = "sysfs", - .get_sb = sysfs_get_sb, - .kill_sb = kill_litter_super, - }; - -+EXPORT_SYMBOL(sysfs_fs_type); -+ - int __init sysfs_init(void) - { - int err = -ENOMEM; - -+#ifdef CONFIG_VE -+ init_ve0_sysfs_root(); -+#endif - sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", - sizeof(struct sysfs_dirent), - 0, 0, NULL, NULL); -diff -upr linux-2.6.16.orig/fs/sysfs/symlink.c linux-2.6.16-026test009/fs/sysfs/symlink.c ---- linux-2.6.16.orig/fs/sysfs/symlink.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/sysfs/symlink.c 2006-04-19 15:02:12.000000000 +0400 -@@ -66,6 +66,7 @@ static int sysfs_add_link(struct dentry - if (!error) - return 0; - -+ kobject_put(target); - kfree(sl->link_name); - exit2: - kfree(sl); -@@ -86,6 +87,9 @@ int sysfs_create_link(struct kobject * k - - BUG_ON(!kobj || !kobj->dentry || !name); - -+ if (!ve_sysfs_alowed()) -+ return 0; -+ - mutex_lock(&dentry->d_inode->i_mutex); - error = sysfs_add_link(dentry, name, target); - mutex_unlock(&dentry->d_inode->i_mutex); -@@ -101,6 +105,9 @@ int sysfs_create_link(struct kobject * k - - void sysfs_remove_link(struct kobject * kobj, const char * name) - { -+ if(!ve_sysfs_alowed()) -+ return; -+ - 
sysfs_hash_and_remove(kobj->dentry,name); - } - -diff -upr linux-2.6.16.orig/fs/sysfs/sysfs.h linux-2.6.16-026test009/fs/sysfs/sysfs.h ---- linux-2.6.16.orig/fs/sysfs/sysfs.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/sysfs/sysfs.h 2006-04-19 15:02:12.000000000 +0400 -@@ -1,5 +1,14 @@ - --extern struct vfsmount * sysfs_mount; -+#ifndef CONFIG_VE -+extern struct vfsmount *sysfs_mount; -+extern struct super_block *sysfs_sb; -+#define ve_sysfs_alowed() (1) -+#else -+#define sysfs_mount (get_exec_env()->sysfs_mnt) -+#define sysfs_sb (get_exec_env()->sysfs_sb) -+#define ve_sysfs_alowed() (sysfs_sb != NULL) -+#endif -+ - extern kmem_cache_t *sysfs_dir_cachep; - - extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); -@@ -19,7 +28,6 @@ extern void sysfs_drop_dentry(struct sys - extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); - - extern struct rw_semaphore sysfs_rename_sem; --extern struct super_block * sysfs_sb; - extern struct file_operations sysfs_dir_operations; - extern struct file_operations sysfs_file_operations; - extern struct file_operations bin_fops; -diff -upr linux-2.6.16.orig/fs/vzdq_file.c linux-2.6.16-026test009/fs/vzdq_file.c ---- linux-2.6.16.orig/fs/vzdq_file.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/fs/vzdq_file.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,852 @@ -+/* -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * This file contains Virtuozzo quota files as proc entry implementation. -+ * It is required for std quota tools to work correctly as they are expecting -+ * aquota.user and aquota.group files. 
-+ */ -+ -+#include <linux/ctype.h> -+#include <linux/slab.h> -+#include <linux/list.h> -+#include <linux/module.h> -+#include <linux/proc_fs.h> -+#include <linux/sysctl.h> -+#include <linux/mount.h> -+#include <linux/namespace.h> -+#include <linux/quotaio_v2.h> -+#include <asm/uaccess.h> -+ -+#include <linux/ve.h> -+#include <linux/ve_proto.h> -+#include <linux/vzdq_tree.h> -+#include <linux/vzquota.h> -+ -+/* ---------------------------------------------------------------------- -+ * -+ * File read operation -+ * -+ * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, -+ * perhaps) abuse vz_quota_sem. -+ * Taking a global semaphore for lengthy and user-controlled operations inside -+ * VPSs is not a good idea in general. -+ * In this case, the reasons for taking this semaphore are completely unclear, -+ * especially taking into account that the only function that has comments -+ * about the necessity to be called under this semaphore -+ * (create_proc_quotafile) is actually called OUTSIDE it. 
-+ * -+ * --------------------------------------------------------------------- */ -+ -+#define DQBLOCK_SIZE 1024 -+#define DQUOTBLKNUM 21U -+#define DQTREE_DEPTH 4 -+#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1) -+#define ISINDBLOCK(num) ((num)%2 != 0) -+#define FIRST_DATABLK 2 /* first even number */ -+#define LAST_IND_LEVEL (DQTREE_DEPTH - 1) -+#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS)) -+#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \ -+ & QUOTATREE_BMASK) -+ -+#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH) -+#error xBITS and DQTREE_DEPTH does not correspond -+#endif -+ -+#define BLOCK_NOT_FOUND 1 -+ -+/* data for quota file -- one per proc entry */ -+struct quotatree_data { -+ struct list_head list; -+ struct vz_quota_master *qmblk; -+ int type; /* type of the tree */ -+}; -+ -+/* serialized by vz_quota_sem */ -+static LIST_HEAD(qf_data_head); -+ -+static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; -+static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS; -+ -+static inline loff_t get_depoff(int depth) -+{ -+ loff_t res = 1; -+ while (depth) { -+ res += (1 << ((depth - 1)*QUOTAID_EBITS + 1)); -+ depth--; -+ } -+ return res; -+} -+ -+static inline loff_t get_blknum(loff_t num, int depth) -+{ -+ loff_t res; -+ res = (num << 1) + get_depoff(depth); -+ return res; -+} -+ -+static int get_depth(loff_t num) -+{ -+ int i; -+ for (i = 0; i < DQTREE_DEPTH; i++) { -+ if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1 -+ || num < get_depoff(i + 1))) -+ return i; -+ } -+ return -1; -+} -+ -+static inline loff_t get_offset(loff_t num) -+{ -+ loff_t res, tmp; -+ -+ tmp = get_depth(num); -+ if (tmp < 0) -+ return -1; -+ num -= get_depoff(tmp); -+ BUG_ON(num < 0); -+ res = num >> 1; -+ -+ return res; -+} -+ -+static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level) -+{ -+ /* return maximum available block num */ -+ return tree->levels[level].freenum; -+} -+ -+static inline 
loff_t get_block_num(struct quotatree_tree *tree) -+{ -+ loff_t ind_blk_num, quot_blk_num, max_ind, max_quot; -+ -+ quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1); -+ max_quot = TREENUM_2_BLKNUM(quot_blk_num); -+ ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1)); -+ max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL) -+ : get_blknum(ind_blk_num, 0); -+ -+ return (max_ind > max_quot) ? max_ind + 1 : max_quot + 1; -+} -+ -+/* Write quota file header */ -+static int read_header(void *buf, struct quotatree_tree *tree, -+ struct dq_info *dq_ugid_info, int type) -+{ -+ struct v2_disk_dqheader *dqh; -+ struct v2_disk_dqinfo *dq_disk_info; -+ -+ dqh = buf; -+ dq_disk_info = buf + sizeof(struct v2_disk_dqheader); -+ -+ dqh->dqh_magic = vzquota_magics[type]; -+ dqh->dqh_version = vzquota_versions[type]; -+ -+ dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire; -+ dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire; -+ dq_disk_info->dqi_flags = 0; /* no flags */ -+ dq_disk_info->dqi_blocks = get_block_num(tree); -+ dq_disk_info->dqi_free_blk = 0; /* first block in the file */ -+ dq_disk_info->dqi_free_entry = FIRST_DATABLK; -+ -+ return 0; -+} -+ -+static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf) -+{ -+ int i, j, lev_num; -+ -+ lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1; -+ for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) { -+ struct quotatree_node *next, *parent; -+ -+ parent = p; -+ next = p; -+ for (j = lev_num; j >= 0; j--) { -+ if (!next->blocks[GETLEVINDX(i,j)]) { -+ buf[i] = 0; -+ goto bad_branch; -+ } -+ parent = next; -+ next = next->blocks[GETLEVINDX(i,j)]; -+ } -+ buf[i] = (depth == DQTREE_DEPTH - 1) ? 
-+ TREENUM_2_BLKNUM(parent->num) -+ : get_blknum(next->num, depth + 1); -+ -+ bad_branch: -+ ; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Write index block to disk (or buffer) -+ * @buf has length 256*sizeof(u_int32_t) bytes -+ */ -+static int read_index_block(int num, u_int32_t *buf, -+ struct quotatree_tree *tree) -+{ -+ struct quotatree_node *p; -+ u_int32_t index; -+ loff_t off; -+ int depth, res; -+ -+ res = BLOCK_NOT_FOUND; -+ index = 0; -+ depth = get_depth(num); -+ off = get_offset(num); -+ if (depth < 0 || off < 0) -+ return -EINVAL; -+ -+ list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh, -+ list) { -+ if (p->num >= off) -+ res = 0; -+ if (p->num != off) -+ continue; -+ get_block_child(depth, p, buf); -+ break; -+ } -+ -+ return res; -+} -+ -+static inline void convert_quot_format(struct v2_disk_dqblk *dq, -+ struct vz_quota_ugid *vzq) -+{ -+ dq->dqb_id = vzq->qugid_id; -+ dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit; -+ dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit; -+ dq->dqb_curinodes = vzq->qugid_stat.icurrent; -+ dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE; -+ dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE; -+ dq->dqb_curspace = vzq->qugid_stat.bcurrent; -+ dq->dqb_btime = vzq->qugid_stat.btime; -+ dq->dqb_itime = vzq->qugid_stat.itime; -+} -+ -+static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree) -+{ -+ int res, i, entries = 0; -+ struct v2_disk_dqdbheader *dq_header; -+ struct quotatree_node *p; -+ struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader); -+ -+ res = BLOCK_NOT_FOUND; -+ dq_header = buf; -+ memset(dq_header, 0, sizeof(*dq_header)); -+ -+ list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh), -+ list) { -+ if (TREENUM_2_BLKNUM(p->num) >= num) -+ res = 0; -+ if (TREENUM_2_BLKNUM(p->num) != num) -+ continue; -+ -+ for (i = 0; i < QUOTATREE_BSIZE; i++) { -+ if (!p->blocks[i]) -+ continue; -+ convert_quot_format(blk + entries, -+ (struct 
vz_quota_ugid *)p->blocks[i]); -+ entries++; -+ res = 0; -+ } -+ break; -+ } -+ dq_header->dqdh_entries = entries; -+ -+ return res; -+} -+ -+static int read_block(int num, void *buf, struct quotatree_tree *tree, -+ struct dq_info *dq_ugid_info, int magic) -+{ -+ int res; -+ -+ memset(buf, 0, DQBLOCK_SIZE); -+ if (!num) -+ res = read_header(buf, tree, dq_ugid_info, magic); -+ else if (ISINDBLOCK(num)) -+ res = read_index_block(num, (u_int32_t*)buf, tree); -+ else -+ res = read_dquot(num, buf, tree); -+ -+ return res; -+} -+ -+/* -+ * FIXME: this function can handle quota files up to 2GB only. -+ */ -+static int read_proc_quotafile(char *page, char **start, off_t off, int count, -+ int *eof, void *data) -+{ -+ off_t blk_num, blk_off, buf_off; -+ char *tmp; -+ size_t buf_size; -+ struct quotatree_data *qtd; -+ struct quotatree_tree *tree; -+ struct dq_info *dqi; -+ int res; -+ -+ qtd = data; -+ down(&vz_quota_sem); -+ down(&qtd->qmblk->dq_sem); -+ -+ res = 0; -+ tree = QUGID_TREE(qtd->qmblk, qtd->type); -+ if (!tree) { -+ *eof = 1; -+ goto out_dq; -+ } -+ -+ res = -ENOMEM; -+ tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL); -+ if (!tmp) -+ goto out_dq; -+ -+ dqi = &qtd->qmblk->dq_ugid_info[qtd->type]; -+ -+ buf_off = 0; -+ buf_size = count; -+ blk_num = off / DQBLOCK_SIZE; -+ blk_off = off % DQBLOCK_SIZE; -+ -+ while (buf_size > 0) { -+ off_t len; -+ -+ len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size); -+ res = read_block(blk_num, tmp, tree, dqi, qtd->type); -+ if (res < 0) -+ goto out_err; -+ if (res == BLOCK_NOT_FOUND) { -+ *eof = 1; -+ break; -+ } -+ memcpy(page + buf_off, tmp + blk_off, len); -+ -+ blk_num++; -+ buf_size -= len; -+ blk_off = 0; -+ buf_off += len; -+ } -+ res = buf_off; -+ -+out_err: -+ kfree(tmp); -+ *start = NULL + count; -+out_dq: -+ up(&qtd->qmblk->dq_sem); -+ up(&vz_quota_sem); -+ -+ return res; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * /proc/vz/vzaquota/QID/aquota.* files -+ * -+ * FIXME: this 
code lacks serialization of read/readdir/lseek. -+ * However, this problem should be fixed after the mainstream issue of what -+ * appears to be non-atomic read and update of file position in sys_read. -+ * -+ * --------------------------------------------------------------------- */ -+ -+static inline unsigned long vzdq_aquot_getino(dev_t dev) -+{ -+ return 0xec000000UL + dev; -+} -+ -+static inline dev_t vzdq_aquot_getidev(struct inode *inode) -+{ -+ return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link; -+} -+ -+static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev) -+{ -+ PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev; -+} -+ -+static ssize_t vzdq_aquotf_read(struct file *file, -+ char __user *buf, size_t size, loff_t *ppos) -+{ -+ char *page; -+ size_t bufsize; -+ ssize_t l, l2, copied; -+ char *start; -+ struct inode *inode; -+ struct block_device *bdev; -+ struct super_block *sb; -+ struct quotatree_data data; -+ int eof, err; -+ -+ err = -ENOMEM; -+ page = (char *)__get_free_page(GFP_KERNEL); -+ if (page == NULL) -+ goto out_err; -+ -+ err = -ENODEV; -+ inode = file->f_dentry->d_inode; -+ bdev = bdget(vzdq_aquot_getidev(inode)); -+ if (bdev == NULL) -+ goto out_err; -+ sb = get_super(bdev); -+ bdput(bdev); -+ if (sb == NULL) -+ goto out_err; -+ data.qmblk = vzquota_find_qmblk(sb); -+ data.type = PROC_I(inode)->type - 1; -+ drop_super(sb); -+ if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD) -+ goto out_err; -+ -+ copied = 0; -+ l = l2 = 0; -+ while (1) { -+ bufsize = min(size, (size_t)PAGE_SIZE); -+ if (bufsize <= 0) -+ break; -+ -+ l = read_proc_quotafile(page, &start, *ppos, bufsize, -+ &eof, &data); -+ if (l <= 0) -+ break; -+ -+ l2 = copy_to_user(buf, page, l); -+ copied += l - l2; -+ if (l2) -+ break; -+ -+ buf += l; -+ size -= l; -+ *ppos += (unsigned long)start; -+ l = l2 = 0; -+ } -+ -+ qmblk_put(data.qmblk); -+ free_page((unsigned long)page); -+ if (copied) -+ return copied; -+ else if (l2) /* last 
copy_to_user failed */ -+ return -EFAULT; -+ else /* read error or EOF */ -+ return l; -+ -+out_err: -+ if (page != NULL) -+ free_page((unsigned long)page); -+ return err; -+} -+ -+static struct file_operations vzdq_aquotf_file_operations = { -+ .read = &vzdq_aquotf_read, -+}; -+ -+static struct inode_operations vzdq_aquotf_inode_operations = { -+}; -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * /proc/vz/vzaquota/QID directory -+ * -+ * --------------------------------------------------------------------- */ -+ -+static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler) -+{ -+ loff_t n; -+ int err; -+ -+ n = file->f_pos; -+ for (err = 0; !err; n++) { -+ switch (n) { -+ case 0: -+ err = (*filler)(data, ".", 1, n, -+ file->f_dentry->d_inode->i_ino, -+ DT_DIR); -+ break; -+ case 1: -+ err = (*filler)(data, "..", 2, n, -+ parent_ino(file->f_dentry), DT_DIR); -+ break; -+ case 2: -+ err = (*filler)(data, "aquota.user", 11, n, -+ file->f_dentry->d_inode->i_ino -+ + USRQUOTA + 1, -+ DT_REG); -+ break; -+ case 3: -+ err = (*filler)(data, "aquota.group", 12, n, -+ file->f_dentry->d_inode->i_ino -+ + GRPQUOTA + 1, -+ DT_REG); -+ break; -+ default: -+ goto out; -+ } -+ } -+out: -+ file->f_pos = n; -+ return err; -+} -+ -+struct vzdq_aquotq_lookdata { -+ dev_t dev; -+ int type; -+}; -+ -+static int vzdq_aquotq_looktest(struct inode *inode, void *data) -+{ -+ struct vzdq_aquotq_lookdata *d; -+ -+ d = data; -+ return inode->i_op == &vzdq_aquotf_inode_operations && -+ vzdq_aquot_getidev(inode) == d->dev && -+ PROC_I(inode)->type == d->type + 1; -+} -+ -+static int vzdq_aquotq_lookset(struct inode *inode, void *data) -+{ -+ struct vzdq_aquotq_lookdata *d; -+ -+ d = data; -+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; -+ inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1; -+ inode->i_mode = S_IFREG | S_IRUSR; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 1; -+ inode->i_op = 
&vzdq_aquotf_inode_operations; -+ inode->i_fop = &vzdq_aquotf_file_operations; -+ PROC_I(inode)->type = d->type + 1; -+ vzdq_aquot_setidev(inode, d->dev); -+ return 0; -+} -+ -+static struct dentry *vzdq_aquotq_lookup(struct inode *dir, -+ struct dentry *dentry, -+ struct nameidata *nd) -+{ -+ struct inode *inode; -+ struct vzdq_aquotq_lookdata d; -+ int k; -+ -+ if (dentry->d_name.len == 11) { -+ if (memcmp(dentry->d_name.name, "aquota.user", 11)) -+ goto out; -+ k = USRQUOTA; -+ } else if (dentry->d_name.len == 12) { -+ if (memcmp(dentry->d_name.name, "aquota.group", 11)) -+ goto out; -+ k = GRPQUOTA; -+ } else -+ goto out; -+ d.dev = vzdq_aquot_getidev(dir); -+ d.type = k; -+ inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1, -+ vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d); -+ if (inode == NULL) -+ goto out; -+ unlock_new_inode(inode); -+ d_add(dentry, inode); -+ return NULL; -+ -+out: -+ return ERR_PTR(-ENOENT); -+} -+ -+static struct file_operations vzdq_aquotq_file_operations = { -+ .read = &generic_read_dir, -+ .readdir = &vzdq_aquotq_readdir, -+}; -+ -+static struct inode_operations vzdq_aquotq_inode_operations = { -+ .lookup = &vzdq_aquotq_lookup, -+}; -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * /proc/vz/vzaquota directory -+ * -+ * --------------------------------------------------------------------- */ -+ -+struct vzdq_aquot_de { -+ struct list_head list; -+ struct vfsmount *mnt; -+}; -+ -+static int vzdq_aquot_buildmntlist(struct ve_struct *ve, -+ struct list_head *head) -+{ -+ struct vfsmount *rmnt, *mnt; -+ struct vzdq_aquot_de *p; -+ int err; -+ -+#ifdef CONFIG_VE -+ rmnt = mntget(ve->fs_rootmnt); -+#else -+ read_lock(¤t->fs->lock); -+ rmnt = mntget(current->fs->rootmnt); -+ read_unlock(¤t->fs->lock); -+#endif -+ mnt = rmnt; -+ spin_lock(&vfsmount_lock); -+ while (1) { -+ list_for_each_entry(p, head, list) { -+ if (p->mnt->mnt_sb == mnt->mnt_sb) -+ goto skip; -+ } -+ -+ err = -ENOMEM; -+ p = 
kmalloc(sizeof(*p), GFP_KERNEL); -+ if (p == NULL) -+ goto out; -+ p->mnt = mntget(mnt); -+ list_add_tail(&p->list, head); -+ -+skip: -+ err = 0; -+ if (list_empty(&mnt->mnt_mounts)) { -+ while (1) { -+ if (mnt == rmnt) -+ goto out; -+ if (mnt->mnt_child.next != -+ &mnt->mnt_parent->mnt_mounts) -+ break; -+ mnt = mnt->mnt_parent; -+ } -+ mnt = list_entry(mnt->mnt_child.next, -+ struct vfsmount, mnt_child); -+ } else -+ mnt = list_entry(mnt->mnt_mounts.next, -+ struct vfsmount, mnt_child); -+ } -+out: -+ spin_unlock(&vfsmount_lock); -+ mntput(rmnt); -+ return err; -+} -+ -+static void vzdq_aquot_releasemntlist(struct ve_struct *ve, -+ struct list_head *head) -+{ -+ struct vzdq_aquot_de *p; -+ -+ while (!list_empty(head)) { -+ p = list_entry(head->next, typeof(*p), list); -+ mntput(p->mnt); -+ list_del(&p->list); -+ kfree(p); -+ } -+} -+ -+static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler) -+{ -+ struct ve_struct *ve, *old_ve; -+ struct list_head mntlist; -+ struct vzdq_aquot_de *de; -+ struct super_block *sb; -+ struct vz_quota_master *qmblk; -+ loff_t i, n; -+ char buf[24]; -+ int l, err; -+ -+ i = 0; -+ n = file->f_pos; -+ ve = VE_OWNER_FSTYPE(file->f_dentry->d_sb->s_type); -+ old_ve = set_exec_env(ve); -+ -+ INIT_LIST_HEAD(&mntlist); -+#ifdef CONFIG_VE -+ /* -+ * The only reason of disabling readdir for the host system is that -+ * this readdir can be slow and CPU consuming with large number of VPSs -+ * (or just mount points). 
-+ */ -+ err = ve_is_super(ve); -+#else -+ err = 0; -+#endif -+ if (!err) { -+ err = vzdq_aquot_buildmntlist(ve, &mntlist); -+ if (err) -+ goto out_err; -+ } -+ -+ if (i >= n) { -+ if ((*filler)(data, ".", 1, i, -+ file->f_dentry->d_inode->i_ino, DT_DIR)) -+ goto out_fill; -+ } -+ i++; -+ -+ if (i >= n) { -+ if ((*filler)(data, "..", 2, i, -+ parent_ino(file->f_dentry), DT_DIR)) -+ goto out_fill; -+ } -+ i++; -+ -+ list_for_each_entry (de, &mntlist, list) { -+ sb = de->mnt->mnt_sb; -+#ifdef CONFIG_VE -+ if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL)) -+ continue; -+#endif -+ qmblk = vzquota_find_qmblk(sb); -+ if (qmblk == NULL || qmblk == VZ_QUOTA_BAD) -+ continue; -+ -+ qmblk_put(qmblk); -+ i++; -+ if (i <= n) -+ continue; -+ -+ l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev)); -+ if ((*filler)(data, buf, l, i - 1, -+ vzdq_aquot_getino(sb->s_dev), DT_DIR)) -+ break; -+ } -+ -+out_fill: -+ err = 0; -+ file->f_pos = i; -+out_err: -+ vzdq_aquot_releasemntlist(ve, &mntlist); -+ (void)set_exec_env(old_ve); -+ return err; -+} -+ -+static int vzdq_aquotd_looktest(struct inode *inode, void *data) -+{ -+ return inode->i_op == &vzdq_aquotq_inode_operations && -+ vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data; -+} -+ -+static int vzdq_aquotd_lookset(struct inode *inode, void *data) -+{ -+ dev_t dev; -+ -+ dev = (dev_t)(unsigned long)data; -+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; -+ inode->i_ino = vzdq_aquot_getino(dev); -+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 2; -+ inode->i_op = &vzdq_aquotq_inode_operations; -+ inode->i_fop = &vzdq_aquotq_file_operations; -+ vzdq_aquot_setidev(inode, dev); -+ return 0; -+} -+ -+static struct dentry *vzdq_aquotd_lookup(struct inode *dir, -+ struct dentry *dentry, -+ struct nameidata *nd) -+{ -+ struct ve_struct *ve, *old_ve; -+ const unsigned char *s; -+ int l; -+ dev_t dev; -+ struct inode *inode; -+ -+ ve = 
VE_OWNER_FSTYPE(dir->i_sb->s_type); -+ old_ve = set_exec_env(ve); -+#ifdef CONFIG_VE -+ /* -+ * Lookup is much lighter than readdir, so it can be allowed for the -+ * host system. But it would be strange to be able to do lookup only -+ * without readdir... -+ */ -+ if (ve_is_super(ve)) -+ goto out; -+#endif -+ -+ dev = 0; -+ l = dentry->d_name.len; -+ if (l <= 0) -+ goto out; -+ for (s = dentry->d_name.name; l > 0; s++, l--) { -+ if (!isxdigit(*s)) -+ goto out; -+ if (dev & ~(~0UL >> 4)) -+ goto out; -+ dev <<= 4; -+ if (isdigit(*s)) -+ dev += *s - '0'; -+ else if (islower(*s)) -+ dev += *s - 'a' + 10; -+ else -+ dev += *s - 'A' + 10; -+ } -+ dev = new_decode_dev(dev); -+ -+#ifdef CONFIG_VE -+ if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL)) -+ goto out; -+#endif -+ -+ inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev), -+ vzdq_aquotd_looktest, vzdq_aquotd_lookset, -+ (void *)(unsigned long)dev); -+ if (inode == NULL) -+ goto out; -+ unlock_new_inode(inode); -+ -+ d_add(dentry, inode); -+ (void)set_exec_env(old_ve); -+ return NULL; -+ -+out: -+ (void)set_exec_env(old_ve); -+ return ERR_PTR(-ENOENT); -+} -+ -+static struct file_operations vzdq_aquotd_file_operations = { -+ .read = &generic_read_dir, -+ .readdir = &vzdq_aquotd_readdir, -+}; -+ -+static struct inode_operations vzdq_aquotd_inode_operations = { -+ .lookup = &vzdq_aquotd_lookup, -+}; -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Initialization and deinitialization -+ * -+ * --------------------------------------------------------------------- */ -+ -+/* -+ * FIXME: creation of proc entries here is unsafe with respect to module -+ * unloading. 
-+ */ -+void vzaquota_init(void) -+{ -+ struct proc_dir_entry *de; -+ -+ de = create_proc_glob_entry("vz/vzaquota", -+ S_IFDIR | S_IRUSR | S_IXUSR, NULL); -+ if (de != NULL) { -+ de->proc_iops = &vzdq_aquotd_inode_operations; -+ de->proc_fops = &vzdq_aquotd_file_operations; -+ } else -+ printk("VZDQ: vz/vzaquota creation failed\n"); -+#if defined(CONFIG_SYSCTL) -+ de = create_proc_glob_entry("sys/fs/quota", -+ S_IFDIR | S_IRUSR | S_IXUSR, NULL); -+ if (de == NULL) -+ printk("VZDQ: sys/fs/quota creation failed\n"); -+#endif -+} -+ -+void vzaquota_fini(void) -+{ -+} -diff -upr linux-2.6.16.orig/fs/vzdq_mgmt.c linux-2.6.16-026test009/fs/vzdq_mgmt.c ---- linux-2.6.16.orig/fs/vzdq_mgmt.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/fs/vzdq_mgmt.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,735 @@ -+/* -+ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ */ -+ -+#include <linux/config.h> -+#include <linux/kernel.h> -+#include <linux/string.h> -+#include <linux/list.h> -+#include <asm/semaphore.h> -+#include <linux/sched.h> -+#include <linux/fs.h> -+#include <linux/dcache.h> -+#include <linux/mount.h> -+#include <linux/namei.h> -+#include <linux/writeback.h> -+#include <linux/gfp.h> -+#include <asm/uaccess.h> -+#include <linux/proc_fs.h> -+#include <linux/quota.h> -+#include <linux/vzctl_quota.h> -+#include <linux/vzquota.h> -+ -+ -+/* ---------------------------------------------------------------------- -+ * Switching quota on. 
-+ * --------------------------------------------------------------------- */ -+ -+/* -+ * check limits copied from user -+ */ -+int vzquota_check_sane_limits(struct dq_stat *qstat) -+{ -+ int err; -+ -+ err = -EINVAL; -+ -+ /* softlimit must be less then hardlimit */ -+ if (qstat->bsoftlimit > qstat->bhardlimit) -+ goto out; -+ -+ if (qstat->isoftlimit > qstat->ihardlimit) -+ goto out; -+ -+ err = 0; -+out: -+ return err; -+} -+ -+/* -+ * check usage values copied from user -+ */ -+int vzquota_check_sane_values(struct dq_stat *qstat) -+{ -+ int err; -+ -+ err = -EINVAL; -+ -+ /* expiration time must not be set if softlimit was not exceeded */ -+ if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != (time_t)0) -+ goto out; -+ -+ if (qstat->icurrent < qstat->isoftlimit && qstat->itime != (time_t)0) -+ goto out; -+ -+ err = vzquota_check_sane_limits(qstat); -+out: -+ return err; -+} -+ -+/* -+ * create new quota master block -+ * this function should: -+ * - copy limits and usage parameters from user buffer; -+ * - allock, initialize quota block and insert it to hash; -+ */ -+static int vzquota_create(unsigned int quota_id, struct vz_quota_stat *u_qstat) -+{ -+ int err; -+ struct vz_quota_stat qstat; -+ struct vz_quota_master *qmblk; -+ -+ down(&vz_quota_sem); -+ -+ err = -EFAULT; -+ if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) -+ goto out; -+ -+ err = -EINVAL; -+ if (quota_id == 0) -+ goto out; -+ -+ if (vzquota_check_sane_values(&qstat.dq_stat)) -+ goto out; -+ err = 0; -+ qmblk = vzquota_alloc_master(quota_id, &qstat); -+ -+ if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */ -+ err = PTR_ERR(qmblk); -+out: -+ up(&vz_quota_sem); -+ -+ return err; -+} -+ -+/** -+ * vzquota_on - turn quota on -+ * -+ * This function should: -+ * - find and get refcnt of directory entry for quota root and corresponding -+ * mountpoint; -+ * - find corresponding quota block and mark it with given path; -+ * - check quota tree; -+ * - initialize quota for the tree root. 
-+ */ -+static int vzquota_on(unsigned int quota_id, const char *quota_root) -+{ -+ int err; -+ struct nameidata nd; -+ struct vz_quota_master *qmblk; -+ struct super_block *dqsb; -+ -+ dqsb = NULL; -+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = -EBUSY; -+ if (qmblk->dq_state != VZDQ_STARTING) -+ goto out; -+ -+ err = user_path_walk(quota_root, &nd); -+ if (err) -+ goto out; -+ /* init path must be a directory */ -+ err = -ENOTDIR; -+ if (!S_ISDIR(nd.dentry->d_inode->i_mode)) -+ goto out_path; -+ -+ qmblk->dq_root_dentry = nd.dentry; -+ qmblk->dq_root_mnt = nd.mnt; -+ qmblk->dq_sb = nd.dentry->d_inode->i_sb; -+ err = vzquota_get_super(qmblk->dq_sb); -+ if (err) -+ goto out_super; -+ -+ /* -+ * Serialization with quota initialization and operations is performed -+ * through generation check: generation is memorized before qmblk is -+ * found and compared under inode_qmblk_lock with assignment. -+ * -+ * Note that the dentry tree is shrunk only for high-level logical -+ * serialization, purely as a courtesy to the user: to have consistent -+ * quota statistics, files should be closed etc. on quota on. -+ */ -+ err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_dentry->d_inode, -+ qmblk); -+ if (err) -+ goto out_init; -+ qmblk->dq_state = VZDQ_WORKING; -+ -+ up(&vz_quota_sem); -+ return 0; -+ -+out_init: -+ dqsb = qmblk->dq_sb; -+out_super: -+ /* clear for qmblk_put/quota_free_master */ -+ qmblk->dq_sb = NULL; -+ qmblk->dq_root_dentry = NULL; -+ qmblk->dq_root_mnt = NULL; -+out_path: -+ path_release(&nd); -+out: -+ if (dqsb) -+ vzquota_put_super(dqsb); -+ up(&vz_quota_sem); -+ return err; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * Switching quota off. 
-+ * --------------------------------------------------------------------- */ -+ -+/* -+ * destroy quota block by ID -+ */ -+static int vzquota_destroy(unsigned int quota_id) -+{ -+ int err; -+ struct vz_quota_master *qmblk; -+ struct dentry *dentry; -+ struct vfsmount *mnt; -+ -+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = -EBUSY; -+ if (qmblk->dq_state == VZDQ_WORKING) -+ goto out; /* quota_off first */ -+ -+ list_del_init(&qmblk->dq_hash); -+ dentry = qmblk->dq_root_dentry; -+ qmblk->dq_root_dentry = NULL; -+ mnt = qmblk->dq_root_mnt; -+ qmblk->dq_root_mnt = NULL; -+ -+ if (qmblk->dq_sb) -+ vzquota_put_super(qmblk->dq_sb); -+ up(&vz_quota_sem); -+ -+ qmblk_put(qmblk); -+ dput(dentry); -+ mntput(mnt); -+ return 0; -+ -+out: -+ up(&vz_quota_sem); -+ return err; -+} -+ -+/** -+ * vzquota_off - turn quota off -+ */ -+ -+static int __vzquota_sync_list(struct list_head *lh, -+ struct vz_quota_master *qmblk, -+ enum writeback_sync_modes sync_mode) -+{ -+ struct writeback_control wbc; -+ LIST_HEAD(list); -+ struct vz_quota_ilink *qlnk; -+ struct inode *inode; -+ int err; -+ -+ memset(&wbc, 0, sizeof(wbc)); -+ wbc.sync_mode = sync_mode; -+ -+ err = 0; -+ while (!list_empty(lh) && !err) { -+ if (need_resched()) { -+ inode_qmblk_unlock(qmblk->dq_sb); -+ schedule(); -+ inode_qmblk_lock(qmblk->dq_sb); -+ } -+ -+ qlnk = list_first_entry(lh, struct vz_quota_ilink, list); -+ list_move(&qlnk->list, &list); -+ -+ inode = igrab(QLNK_INODE(qlnk)); -+ if (!inode) -+ continue; -+ -+ inode_qmblk_unlock(qmblk->dq_sb); -+ -+ wbc.nr_to_write = LONG_MAX; -+ err = sync_inode(inode, &wbc); -+ iput(inode); -+ -+ inode_qmblk_lock(qmblk->dq_sb); -+ } -+ -+ list_splice(&list, lh); -+ return err; -+} -+ -+static int vzquota_sync_list(struct list_head *lh, -+ struct vz_quota_master *qmblk) -+{ -+ int err; -+ -+ err = __vzquota_sync_list(lh, qmblk, WB_SYNC_NONE); -+ if (err) -+ return err; -+ -+ err = 
__vzquota_sync_list(lh, qmblk, WB_SYNC_ALL); -+ if (err) -+ return err; -+ -+ return 0; -+} -+ -+static int vzquota_sync_inodes(struct vz_quota_master *qmblk) -+{ -+ int err; -+ LIST_HEAD(qlnk_list); -+ -+ list_splice_init(&qmblk->dq_ilink_list, &qlnk_list); -+ err = vzquota_sync_list(&qlnk_list, qmblk); -+ if (!err && !list_empty(&qmblk->dq_ilink_list)) -+ err = -EBUSY; -+ list_splice(&qlnk_list, &qmblk->dq_ilink_list); -+ -+ return err; -+} -+ -+static int vzquota_off(unsigned int quota_id) -+{ -+ int err; -+ struct vz_quota_master *qmblk; -+ -+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = -EALREADY; -+ if (qmblk->dq_state != VZDQ_WORKING) -+ goto out; -+ -+ inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */ -+ err = vzquota_sync_inodes(qmblk); -+ if (err) -+ goto out_unlock; -+ inode_qmblk_unlock(qmblk->dq_sb); -+ -+ err = vzquota_off_qmblk(qmblk->dq_sb, qmblk); -+ if (err) -+ goto out; -+ -+ /* vzquota_destroy will free resources */ -+ qmblk->dq_state = VZDQ_STOPING; -+out: -+ up(&vz_quota_sem); -+ -+ return err; -+ -+out_unlock: -+ inode_qmblk_unlock(qmblk->dq_sb); -+ goto out; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * Other VZQUOTA ioctl's. 
-+ * --------------------------------------------------------------------- */ -+ -+/* -+ * this function should: -+ * - set new limits/buffer under quota master block lock -+ * - if new softlimit less then usage, then set expiration time -+ * - no need to alloc ugid hash table - we'll do that on demand -+ */ -+int vzquota_update_limit(struct dq_stat *_qstat, -+ struct dq_stat *qstat) -+{ -+ int err; -+ -+ err = -EINVAL; -+ if (vzquota_check_sane_limits(qstat)) -+ goto out; -+ -+ err = 0; -+ -+ /* limits */ -+ _qstat->bsoftlimit = qstat->bsoftlimit; -+ _qstat->bhardlimit = qstat->bhardlimit; -+ /* -+ * If the soft limit is exceeded, administrator can override the moment -+ * when the grace period for limit exceeding ends. -+ * Specifying the moment may be useful if the soft limit is set to be -+ * lower than the current usage. In the latter case, if the grace -+ * period end isn't specified, the grace period will start from the -+ * moment of the first write operation. -+ * There is a race with the user level. Soft limit may be already -+ * exceeded before the limit change, and grace period end calculated by -+ * the kernel will be overriden. User level may check if the limit is -+ * already exceeded, but check and set calls are not atomic. -+ * This race isn't dangerous. Under normal cicrumstances, the -+ * difference between the grace period end calculated by the kernel and -+ * the user level should be not greater than as the difference between -+ * the moments of check and set calls, i.e. not bigger than the quota -+ * timer resolution - 1 sec. -+ */ -+ if (qstat->btime != (time_t)0 && -+ _qstat->bcurrent >= _qstat->bsoftlimit) -+ _qstat->btime = qstat->btime; -+ -+ _qstat->isoftlimit = qstat->isoftlimit; -+ _qstat->ihardlimit = qstat->ihardlimit; -+ if (qstat->itime != (time_t)0 && -+ _qstat->icurrent >= _qstat->isoftlimit) -+ _qstat->itime = qstat->itime; -+ -+out: -+ return err; -+} -+ -+/* -+ * set new quota limits. 
-+ * this function should: -+ * copy new limits from user level -+ * - find quota block -+ * - set new limits and flags. -+ */ -+static int vzquota_setlimit(unsigned int quota_id, -+ struct vz_quota_stat *u_qstat) -+{ -+ int err; -+ struct vz_quota_stat qstat; -+ struct vz_quota_master *qmblk; -+ -+ down(&vz_quota_sem); /* for hash list protection */ -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = -EFAULT; -+ if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) -+ goto out; -+ -+ qmblk_data_write_lock(qmblk); -+ err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat); -+ if (err == 0) -+ qmblk->dq_info = qstat.dq_info; -+ qmblk_data_write_unlock(qmblk); -+ -+out: -+ up(&vz_quota_sem); -+ return err; -+} -+ -+/* -+ * get quota limits. -+ * very simple - just return stat buffer to user -+ */ -+static int vzquota_getstat(unsigned int quota_id, -+ struct vz_quota_stat *u_qstat) -+{ -+ int err; -+ struct vz_quota_stat qstat; -+ struct vz_quota_master *qmblk; -+ -+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ qmblk_data_read_lock(qmblk); -+ /* copy whole buffer under lock */ -+ memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat)); -+ memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info)); -+ qmblk_data_read_unlock(qmblk); -+ -+ err = copy_to_user(u_qstat, &qstat, sizeof(qstat)); -+ if (err) -+ err = -EFAULT; -+ -+out: -+ up(&vz_quota_sem); -+ return err; -+} -+ -+/* -+ * This is a system call to turn per-VE disk quota on. 
-+ * Note this call is allowed to run ONLY from VE0 -+ */ -+long do_vzquotactl(int cmd, unsigned int quota_id, -+ struct vz_quota_stat *qstat, const char *ve_root) -+{ -+ int ret; -+ -+ ret = -EPERM; -+ /* access allowed only from root of VE0 */ -+ if (!capable(CAP_SYS_RESOURCE) || -+ !capable(CAP_SYS_ADMIN)) -+ goto out; -+ -+ switch (cmd) { -+ case VZ_DQ_CREATE: -+ ret = vzquota_create(quota_id, qstat); -+ break; -+ case VZ_DQ_DESTROY: -+ ret = vzquota_destroy(quota_id); -+ break; -+ case VZ_DQ_ON: -+ ret = vzquota_on(quota_id, ve_root); -+ break; -+ case VZ_DQ_OFF: -+ ret = vzquota_off(quota_id); -+ break; -+ case VZ_DQ_SETLIMIT: -+ ret = vzquota_setlimit(quota_id, qstat); -+ break; -+ case VZ_DQ_GETSTAT: -+ ret = vzquota_getstat(quota_id, qstat); -+ break; -+ -+ default: -+ ret = -EINVAL; -+ goto out; -+ } -+ -+out: -+ return ret; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * Proc filesystem routines -+ * ---------------------------------------------------------------------*/ -+ -+#if defined(CONFIG_PROC_FS) -+ -+#define QUOTA_UINT_LEN 15 -+#define QUOTA_TIME_LEN_FMT_UINT "%11u" -+#define QUOTA_NUM_LEN_FMT_UINT "%15u" -+#define QUOTA_NUM_LEN_FMT_ULL "%15Lu" -+#define QUOTA_TIME_LEN_FMT_STR "%11s" -+#define QUOTA_NUM_LEN_FMT_STR "%15s" -+#define QUOTA_PROC_MAX_LINE_LEN 2048 -+ -+/* -+ * prints /proc/ve_dq header line -+ */ -+static int print_proc_header(char * buffer) -+{ -+ return sprintf(buffer, -+ "%-11s" -+ QUOTA_NUM_LEN_FMT_STR -+ QUOTA_NUM_LEN_FMT_STR -+ QUOTA_NUM_LEN_FMT_STR -+ QUOTA_TIME_LEN_FMT_STR -+ QUOTA_TIME_LEN_FMT_STR -+ "\n", -+ "qid: path", -+ "usage", "softlimit", "hardlimit", "time", "expire"); -+} -+ -+/* -+ * prints proc master record id, dentry path -+ */ -+static int print_proc_master_id(char * buffer, char * path_buf, -+ struct vz_quota_master * qp) -+{ -+ char *path; -+ int over; -+ -+ path = NULL; -+ switch (qp->dq_state) { -+ case VZDQ_WORKING: -+ if (!path_buf) { -+ path = ""; -+ break; -+ 
} -+ path = d_path(qp->dq_root_dentry, -+ qp->dq_root_mnt, path_buf, PAGE_SIZE); -+ if (IS_ERR(path)) { -+ path = ""; -+ break; -+ } -+ /* do not print large path, truncate it */ -+ over = strlen(path) - -+ (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 - -+ QUOTA_UINT_LEN); -+ if (over > 0) { -+ path += over - 3; -+ path[0] = path[1] = path[3] = '.'; -+ } -+ break; -+ case VZDQ_STARTING: -+ path = "-- started --"; -+ break; -+ case VZDQ_STOPING: -+ path = "-- stopped --"; -+ break; -+ } -+ -+ return sprintf(buffer, "%u: %s\n", qp->dq_id, path); -+} -+ -+/* -+ * prints struct vz_quota_stat data -+ */ -+static int print_proc_stat(char * buffer, struct dq_stat *qs, -+ struct dq_info *qi) -+{ -+ return sprintf(buffer, -+ "%11s" -+ QUOTA_NUM_LEN_FMT_ULL -+ QUOTA_NUM_LEN_FMT_ULL -+ QUOTA_NUM_LEN_FMT_ULL -+ QUOTA_TIME_LEN_FMT_UINT -+ QUOTA_TIME_LEN_FMT_UINT -+ "\n" -+ "%11s" -+ QUOTA_NUM_LEN_FMT_UINT -+ QUOTA_NUM_LEN_FMT_UINT -+ QUOTA_NUM_LEN_FMT_UINT -+ QUOTA_TIME_LEN_FMT_UINT -+ QUOTA_TIME_LEN_FMT_UINT -+ "\n", -+ "1k-blocks", -+ qs->bcurrent >> 10, -+ qs->bsoftlimit >> 10, -+ qs->bhardlimit >> 10, -+ (unsigned int)qs->btime, -+ (unsigned int)qi->bexpire, -+ "inodes", -+ qs->icurrent, -+ qs->isoftlimit, -+ qs->ihardlimit, -+ (unsigned int)qs->itime, -+ (unsigned int)qi->iexpire); -+} -+ -+ -+/* -+ * for /proc filesystem output -+ */ -+static int vzquota_read_proc(char *page, char **start, off_t off, int count, -+ int *eof, void *data) -+{ -+ int len, i; -+ off_t printed = 0; -+ char *p = page; -+ struct vz_quota_master *qp; -+ struct vz_quota_ilink *ql2; -+ struct list_head *listp; -+ char *path_buf; -+ -+ path_buf = (char*)__get_free_page(GFP_KERNEL); -+ if (path_buf == NULL) -+ return -ENOMEM; -+ -+ len = print_proc_header(p); -+ printed += len; -+ if (off < printed) /* keep header in output */ { -+ *start = p + off; -+ p += len; -+ } -+ -+ down(&vz_quota_sem); -+ -+ /* traverse master hash table for all records */ -+ for (i = 0; i < vzquota_hash_size; i++) { -+ 
list_for_each(listp, &vzquota_hash_table[i]) { -+ qp = list_entry(listp, -+ struct vz_quota_master, dq_hash); -+ -+ /* Skip other VE's information if not root of VE0 */ -+ if ((!capable(CAP_SYS_ADMIN) || -+ !capable(CAP_SYS_RESOURCE))) { -+ ql2 = INODE_QLNK(current->fs->root->d_inode); -+ if (ql2 == NULL || qp != ql2->qmblk) -+ continue; -+ } -+ /* -+ * Now print the next record -+ */ -+ len = 0; -+ /* we print quotaid and path only in VE0 */ -+ if (capable(CAP_SYS_ADMIN)) -+ len += print_proc_master_id(p+len,path_buf, qp); -+ len += print_proc_stat(p+len, &qp->dq_stat, -+ &qp->dq_info); -+ printed += len; -+ /* skip unnecessary lines */ -+ if (printed <= off) -+ continue; -+ p += len; -+ /* provide start offset */ -+ if (*start == NULL) -+ *start = p + (off - printed); -+ /* have we printed all requested size? */ -+ if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN || -+ (p - *start) >= count) -+ goto out; -+ } -+ } -+ -+ *eof = 1; /* checked all hash */ -+out: -+ up(&vz_quota_sem); -+ -+ len = 0; -+ if (*start != NULL) { -+ len = (p - *start); -+ if (len > count) -+ len = count; -+ } -+ -+ if (path_buf) -+ free_page((unsigned long) path_buf); -+ -+ return len; -+} -+ -+/* -+ * Register procfs read callback -+ */ -+int vzquota_proc_init(void) -+{ -+ struct proc_dir_entry *de; -+ -+ de = create_proc_entry("vz/vzquota", S_IFREG|S_IRUSR, NULL); -+ if (de == NULL) { -+ /* create "vz" subdirectory, if not exist */ -+ de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); -+ if (de == NULL) -+ goto out_err; -+ de = create_proc_entry("vzquota", S_IFREG|S_IRUSR, de); -+ if (de == NULL) -+ goto out_err; -+ } -+ de->read_proc = vzquota_read_proc; -+ de->data = NULL; -+ return 0; -+out_err: -+ return -EBUSY; -+} -+ -+void vzquota_proc_release(void) -+{ -+ /* Unregister procfs read callback */ -+ remove_proc_entry("vz/vzquota", NULL); -+} -+ -+#endif -diff -upr linux-2.6.16.orig/fs/vzdq_ops.c linux-2.6.16-026test009/fs/vzdq_ops.c ---- linux-2.6.16.orig/fs/vzdq_ops.c 
2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/fs/vzdq_ops.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,565 @@ -+/* -+ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ */ -+ -+#include <linux/config.h> -+#include <linux/kernel.h> -+#include <linux/types.h> -+#include <asm/semaphore.h> -+#include <linux/sched.h> -+#include <linux/fs.h> -+#include <linux/quota.h> -+#include <linux/vzquota.h> -+ -+ -+/* ---------------------------------------------------------------------- -+ * Quota superblock operations - helper functions. -+ * --------------------------------------------------------------------- */ -+ -+static inline void vzquota_incr_inodes(struct dq_stat *dqstat, -+ unsigned long number) -+{ -+ dqstat->icurrent += number; -+} -+ -+static inline void vzquota_incr_space(struct dq_stat *dqstat, -+ __u64 number) -+{ -+ dqstat->bcurrent += number; -+} -+ -+static inline void vzquota_decr_inodes(struct dq_stat *dqstat, -+ unsigned long number) -+{ -+ if (dqstat->icurrent > number) -+ dqstat->icurrent -= number; -+ else -+ dqstat->icurrent = 0; -+ if (dqstat->icurrent < dqstat->isoftlimit) -+ dqstat->itime = (time_t) 0; -+} -+ -+static inline void vzquota_decr_space(struct dq_stat *dqstat, -+ __u64 number) -+{ -+ if (dqstat->bcurrent > number) -+ dqstat->bcurrent -= number; -+ else -+ dqstat->bcurrent = 0; -+ if (dqstat->bcurrent < dqstat->bsoftlimit) -+ dqstat->btime = (time_t) 0; -+} -+ -+/* -+ * better printk() message or use /proc/vzquotamsg interface -+ * similar to /proc/kmsg -+ */ -+static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag, -+ const char *fmt) -+{ -+ if (dq_info->flags & flag) /* warning already printed for this -+ masterblock */ -+ return; -+ printk(fmt, dq_id); -+ dq_info->flags |= flag; -+} -+ -+/* -+ * ignore_hardlimit - -+ * -+ * Intended to allow superuser of VE0 to overwrite hardlimits. 
/*
 * ignore_hardlimit -
 *
 * Intended to allow superuser of VE0 to overwrite hardlimits.
 *
 * ignore_hardlimit() has a very bad feature:
 *
 *	writepage() operation for writable mapping of a file with holes
 *	may trigger get_block() with wrong current and as a consequence,
 *	opens a possibility to overcommit hardlimits
 */
/* for the reason above, it is disabled now (always returns 0) */
static inline int ignore_hardlimit(struct dq_info *dqstat)
{
#if 0
	return	ve_is_super(get_exec_env()) &&
		capable(CAP_SYS_RESOURCE) &&
		(dqstat->options & VZ_QUOTA_OPT_RSQUASH);
#else
	return 0;
#endif
}

/*
 * Check whether @number more inodes fit into the master quota @dqstat.
 *
 * Returns NO_QUOTA when the hard limit would be exceeded or the soft limit
 * is exceeded and its grace period has run out; QUOTA_OK otherwise.
 * Side effects: may print a warning for quota @dq_id and, on the first
 * soft limit excess, starts the grace period (itime).
 */
static int vzquota_check_inodes(struct dq_info *dq_info,
		struct dq_stat *dqstat,
		unsigned long number, int dq_id)
{
	if (number == 0)
		return QUOTA_OK;

	if (dqstat->icurrent + number > dqstat->ihardlimit &&
	    !ignore_hardlimit(dq_info)) {
		vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
			   "VZ QUOTA: file hardlimit reached for id=%d\n");
		return NO_QUOTA;
	}

	if (dqstat->icurrent + number > dqstat->isoftlimit) {
		if (dqstat->itime == (time_t)0) {
			/* flag 0: this warning is never suppressed */
			vzquota_warn(dq_info, dq_id, 0,
				"VZ QUOTA: file softlimit exceeded "
				"for id=%d\n");
			dqstat->itime = CURRENT_TIME_SECONDS +
				dq_info->iexpire;
		} else if (CURRENT_TIME_SECONDS >= dqstat->itime &&
			   !ignore_hardlimit(dq_info)) {
			vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
				"VZ QUOTA: file softlimit expired "
				"for id=%d\n");
			return NO_QUOTA;
		}
	}

	return QUOTA_OK;
}

/*
 * Check whether @number more bytes fit into the master quota @dqstat.
 *
 * Same scheme as vzquota_check_inodes(), with two differences for
 * @prealloc requests: no warnings are printed, and a preallocation is
 * refused as soon as it would exceed the soft limit (no grace period is
 * started for it).
 */
static int vzquota_check_space(struct dq_info *dq_info,
		struct dq_stat *dqstat,
		__u64 number, int dq_id, char prealloc)
{
	if (number == 0)
		return QUOTA_OK;

	if (dqstat->bcurrent + number > dqstat->bhardlimit &&
	    !ignore_hardlimit(dq_info)) {
		if (!prealloc)
			vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
				"VZ QUOTA: disk hardlimit reached "
				"for id=%d\n");
		return NO_QUOTA;
	}

	if (dqstat->bcurrent + number > dqstat->bsoftlimit) {
		if (dqstat->btime == (time_t)0) {
			if (!prealloc) {
				/* flag 0: this warning is never suppressed */
				vzquota_warn(dq_info, dq_id, 0,
					"VZ QUOTA: disk softlimit exceeded "
					"for id=%d\n");
				dqstat->btime = CURRENT_TIME_SECONDS
					+ dq_info->bexpire;
			} else {
				/*
				 * Original Linux quota doesn't allow
				 * preallocation to exceed softlimit so
				 * exceeding will be always printed
				 */
				return NO_QUOTA;
			}
		} else if (CURRENT_TIME_SECONDS >= dqstat->btime &&
			   !ignore_hardlimit(dq_info)) {
			if (!prealloc)
				vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
					"VZ QUOTA: disk quota "
					"softlimit expired "
					"for id=%d\n");
			return NO_QUOTA;
		}
	}

	return QUOTA_OK;
}

/*
 * Check whether @number more inodes fit into the user/group quota
 * qugid[@type].
 *
 * A NULL entry means "not limited" (QUOTA_OK); VZ_QUOTA_UGBAD always
 * fails.  Nothing is checked when the quota type is switched off in
 * qmblk->dq_flags.  A limit of zero means "no limit".
 * Side effect: starts the inode grace period on the first soft limit
 * excess.
 */
static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk,
		struct vz_quota_ugid *qugid[],
		int type, unsigned long number)
{
	struct dq_info *dqinfo;
	struct dq_stat *dqstat;

	if (qugid[type] == NULL)
		return QUOTA_OK;
	if (qugid[type] == VZ_QUOTA_UGBAD)
		return NO_QUOTA;

	if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
		return QUOTA_OK;
	if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
		return QUOTA_OK;
	if (number == 0)
		return QUOTA_OK;

	dqinfo = &qmblk->dq_ugid_info[type];
	dqstat = &qugid[type]->qugid_stat;

	if (dqstat->ihardlimit != 0 &&
	    dqstat->icurrent + number > dqstat->ihardlimit)
		return NO_QUOTA;

	if (dqstat->isoftlimit != 0 &&
	    dqstat->icurrent + number > dqstat->isoftlimit) {
		if (dqstat->itime == (time_t)0)
			dqstat->itime = CURRENT_TIME_SECONDS +
				dqinfo->iexpire;
		else if (CURRENT_TIME_SECONDS >= dqstat->itime)
			return NO_QUOTA;
	}

	return QUOTA_OK;
}

/*
 * Check whether @number more bytes fit into the user/group quota
 * qugid[@type].
 *
 * Same scheme as vzquota_check_ugid_inodes(); a @prealloc request that
 * would exceed the soft limit is refused instead of starting the grace
 * period.
 */
static int vzquota_check_ugid_space(struct vz_quota_master *qmblk,
		struct vz_quota_ugid *qugid[],
		int type, __u64 number, char prealloc)
{
	struct dq_info *dqinfo;
	struct dq_stat *dqstat;

	if (qugid[type] == NULL)
		return QUOTA_OK;
	if (qugid[type] == VZ_QUOTA_UGBAD)
		return NO_QUOTA;

	if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
		return QUOTA_OK;
	if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
		return QUOTA_OK;
	if (number == 0)
		return QUOTA_OK;

	dqinfo = &qmblk->dq_ugid_info[type];
	dqstat = &qugid[type]->qugid_stat;

	if (dqstat->bhardlimit != 0 &&
	    dqstat->bcurrent + number > dqstat->bhardlimit)
		return NO_QUOTA;

	if (dqstat->bsoftlimit != 0 &&
	    dqstat->bcurrent + number > dqstat->bsoftlimit) {
		if (dqstat->btime == (time_t)0) {
			if (!prealloc)
				dqstat->btime = CURRENT_TIME_SECONDS
					+ dqinfo->bexpire;
			else
				/*
				 * Original Linux quota doesn't allow
				 * preallocation to exceed softlimit so
				 * exceeding will be always printed
				 */
				return NO_QUOTA;
		} else if (CURRENT_TIME_SECONDS >= dqstat->btime)
			return NO_QUOTA;
	}

	return QUOTA_OK;
}

/* ----------------------------------------------------------------------
 * Quota superblock operations
 * --------------------------------------------------------------------- */

/*
 * S_NOQUOTA note.
 * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for
 *  - quota file (absent in our case)
 *  - after explicit DQUOT_DROP (earlier than clear_inode) in functions like
 *    filesystem-specific new_inode, before the inode gets outside links.
 * For the latter case, the only quota operation where care about S_NOQUOTA
 * might be required is vzquota_drop, but there S_NOQUOTA has already been
 * checked in DQUOT_DROP().
 * So, S_NOQUOTA may be ignored for now in the VZDQ code.
 *
 * The above note is not entirely correct.
 * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from
 * delete_inode if new_inode fails (for example, because of inode quota
 * limits), so S_NOQUOTA check is needed in free_inode.
 * This seems to be the dark corner of the current quota API.
 */
-+ */ -+static int vzquota_initialize(struct inode *inode, int type) -+{ -+ vzquota_inode_init_call(inode); -+ return 0; /* ignored by caller */ -+} -+ -+/* -+ * Release quota for the specified inode. -+ */ -+static int vzquota_drop(struct inode *inode) -+{ -+ vzquota_inode_drop_call(inode); -+ return 0; /* ignored by caller */ -+} -+ -+/* -+ * Allocate block callback. -+ * -+ * If (prealloc) disk quota exceeding warning is not printed. -+ * See Linux quota to know why. -+ * -+ * Return: -+ * QUOTA_OK == 0 on SUCCESS -+ * NO_QUOTA == 1 if allocation should fail -+ */ -+static int vzquota_alloc_space(struct inode *inode, -+ qsize_t number, int prealloc) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_datast data; -+ int ret = QUOTA_OK; -+ -+ qmblk = vzquota_inode_data(inode, &data); -+ if (qmblk == VZ_QUOTA_BAD) -+ return NO_QUOTA; -+ if (qmblk != NULL) { -+#ifdef CONFIG_VZ_QUOTA_UGID -+ int cnt; -+ struct vz_quota_ugid * qugid[MAXQUOTAS]; -+#endif -+ -+ /* checking first */ -+ ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat, -+ number, qmblk->dq_id, prealloc); -+ if (ret == NO_QUOTA) -+ goto no_quota; -+#ifdef CONFIG_VZ_QUOTA_UGID -+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -+ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; -+ ret = vzquota_check_ugid_space(qmblk, qugid, -+ cnt, number, prealloc); -+ if (ret == NO_QUOTA) -+ goto no_quota; -+ } -+ /* check ok, may increment */ -+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -+ if (qugid[cnt] == NULL) -+ continue; -+ vzquota_incr_space(&qugid[cnt]->qugid_stat, number); -+ } -+#endif -+ vzquota_incr_space(&qmblk->dq_stat, number); -+ vzquota_data_unlock(inode, &data); -+ } -+ -+ inode_add_bytes(inode, number); -+ might_sleep(); -+ return QUOTA_OK; -+ -+no_quota: -+ vzquota_data_unlock(inode, &data); -+ return NO_QUOTA; -+} -+ -+/* -+ * Allocate inodes callback. 
-+ * -+ * Return: -+ * QUOTA_OK == 0 on SUCCESS -+ * NO_QUOTA == 1 if allocation should fail -+ */ -+static int vzquota_alloc_inode(const struct inode *inode, unsigned long number) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_datast data; -+ int ret = QUOTA_OK; -+ -+ qmblk = vzquota_inode_data((struct inode *)inode, &data); -+ if (qmblk == VZ_QUOTA_BAD) -+ return NO_QUOTA; -+ if (qmblk != NULL) { -+#ifdef CONFIG_VZ_QUOTA_UGID -+ int cnt; -+ struct vz_quota_ugid *qugid[MAXQUOTAS]; -+#endif -+ -+ /* checking first */ -+ ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat, -+ number, qmblk->dq_id); -+ if (ret == NO_QUOTA) -+ goto no_quota; -+#ifdef CONFIG_VZ_QUOTA_UGID -+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -+ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; -+ ret = vzquota_check_ugid_inodes(qmblk, qugid, -+ cnt, number); -+ if (ret == NO_QUOTA) -+ goto no_quota; -+ } -+ /* check ok, may increment */ -+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -+ if (qugid[cnt] == NULL) -+ continue; -+ vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number); -+ } -+#endif -+ vzquota_incr_inodes(&qmblk->dq_stat, number); -+ vzquota_data_unlock((struct inode *)inode, &data); -+ } -+ -+ might_sleep(); -+ return QUOTA_OK; -+ -+no_quota: -+ vzquota_data_unlock((struct inode *)inode, &data); -+ return NO_QUOTA; -+} -+ -+/* -+ * Free space callback. 
-+ */ -+static int vzquota_free_space(struct inode *inode, qsize_t number) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_datast data; -+ -+ qmblk = vzquota_inode_data(inode, &data); -+ if (qmblk == VZ_QUOTA_BAD) -+ return NO_QUOTA; /* isn't checked by the caller */ -+ if (qmblk != NULL) { -+#ifdef CONFIG_VZ_QUOTA_UGID -+ int cnt; -+ struct vz_quota_ugid * qugid; -+#endif -+ -+ vzquota_decr_space(&qmblk->dq_stat, number); -+#ifdef CONFIG_VZ_QUOTA_UGID -+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -+ qugid = INODE_QLNK(inode)->qugid[cnt]; -+ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) -+ continue; -+ vzquota_decr_space(&qugid->qugid_stat, number); -+ } -+#endif -+ vzquota_data_unlock(inode, &data); -+ } -+ inode_sub_bytes(inode, number); -+ might_sleep(); -+ return QUOTA_OK; -+} -+ -+/* -+ * Free inodes callback. -+ */ -+static int vzquota_free_inode(const struct inode *inode, unsigned long number) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_datast data; -+ -+ if (IS_NOQUOTA(inode)) -+ return QUOTA_OK; -+ -+ qmblk = vzquota_inode_data((struct inode *)inode, &data); -+ if (qmblk == VZ_QUOTA_BAD) -+ return NO_QUOTA; -+ if (qmblk != NULL) { -+#ifdef CONFIG_VZ_QUOTA_UGID -+ int cnt; -+ struct vz_quota_ugid * qugid; -+#endif -+ -+ vzquota_decr_inodes(&qmblk->dq_stat, number); -+#ifdef CONFIG_VZ_QUOTA_UGID -+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -+ qugid = INODE_QLNK(inode)->qugid[cnt]; -+ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) -+ continue; -+ vzquota_decr_inodes(&qugid->qugid_stat, number); -+ } -+#endif -+ vzquota_data_unlock((struct inode *)inode, &data); -+ } -+ might_sleep(); -+ return QUOTA_OK; -+} -+ -+#if defined(CONFIG_VZ_QUOTA_UGID) -+ -+/* -+ * helper function for quota_transfer -+ * check that we can add inode to this quota_id -+ */ -+static int vzquota_transfer_check(struct vz_quota_master *qmblk, -+ struct vz_quota_ugid *qugid[], -+ unsigned int type, __u64 size) -+{ -+ if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) 
!= QUOTA_OK || -+ vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK) -+ return -1; -+ return 0; -+} -+ -+int vzquota_transfer_usage(struct inode *inode, -+ int mask, -+ struct vz_quota_ilink *qlnk) -+{ -+ struct vz_quota_ugid *qugid_old; -+ __u64 space; -+ int i; -+ -+ space = inode_get_bytes(inode); -+ for (i = 0; i < MAXQUOTAS; i++) { -+ if (!(mask & (1 << i))) -+ continue; -+ if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space)) -+ return -1; -+ } -+ -+ for (i = 0; i < MAXQUOTAS; i++) { -+ if (!(mask & (1 << i))) -+ continue; -+ qugid_old = INODE_QLNK(inode)->qugid[i]; -+ vzquota_decr_space(&qugid_old->qugid_stat, space); -+ vzquota_decr_inodes(&qugid_old->qugid_stat, 1); -+ vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space); -+ vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1); -+ } -+ return 0; -+} -+ -+/* -+ * Transfer the inode between diffent user/group quotas. -+ */ -+static int vzquota_transfer(struct inode *inode, struct iattr *iattr) -+{ -+ return vzquota_inode_transfer_call(inode, iattr) ? -+ NO_QUOTA : QUOTA_OK; -+} -+ -+#else /* CONFIG_VZ_QUOTA_UGID */ -+ -+static int vzquota_transfer(struct inode *inode, struct iattr *iattr) -+{ -+ return QUOTA_OK; -+} -+ -+#endif -+ -+/* -+ * Called under following semaphores: -+ * old_d->d_inode->i_sb->s_vfs_rename_sem -+ * old_d->d_inode->i_sem -+ * new_d->d_inode->i_sem -+ * [not verified --SAW] -+ */ -+static int vzquota_rename(struct inode *inode, -+ struct inode *old_dir, struct inode *new_dir) -+{ -+ return vzquota_rename_check(inode, old_dir, new_dir) ? -+ NO_QUOTA : QUOTA_OK; -+} -+ -+/* -+ * Structure of superblock diskquota operations. 
-+ */ -+struct dquot_operations vz_quota_operations = { -+ initialize: vzquota_initialize, -+ drop: vzquota_drop, -+ alloc_space: vzquota_alloc_space, -+ alloc_inode: vzquota_alloc_inode, -+ free_space: vzquota_free_space, -+ free_inode: vzquota_free_inode, -+ transfer: vzquota_transfer, -+ rename: vzquota_rename -+}; -diff -upr linux-2.6.16.orig/fs/vzdq_tree.c linux-2.6.16-026test009/fs/vzdq_tree.c ---- linux-2.6.16.orig/fs/vzdq_tree.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/fs/vzdq_tree.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,286 @@ -+/* -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * This file contains Virtuozzo quota tree implementation -+ */ -+ -+#include <linux/errno.h> -+#include <linux/slab.h> -+#include <linux/vzdq_tree.h> -+ -+struct quotatree_tree *quotatree_alloc(void) -+{ -+ int l; -+ struct quotatree_tree *tree; -+ -+ tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL); -+ if (tree == NULL) -+ goto out; -+ -+ for (l = 0; l < QUOTATREE_DEPTH; l++) { -+ INIT_LIST_HEAD(&tree->levels[l].usedlh); -+ INIT_LIST_HEAD(&tree->levels[l].freelh); -+ tree->levels[l].freenum = 0; -+ } -+ tree->root = NULL; -+ tree->leaf_num = 0; -+out: -+ return tree; -+} -+ -+static struct quotatree_node * -+quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level, -+ struct quotatree_find_state *st) -+{ -+ void **block; -+ struct quotatree_node *parent; -+ int l, index; -+ -+ parent = NULL; -+ block = (void **)&tree->root; -+ l = 0; -+ while (l < level && *block != NULL) { -+ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; -+ parent = *block; -+ block = parent->blocks + index; -+ l++; -+ } -+ if (st != NULL) { -+ st->block = block; -+ st->level = l; -+ } -+ -+ return parent; -+} -+ -+void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, -+ struct quotatree_find_state *st) -+{ -+ quotatree_follow(tree, id, QUOTATREE_DEPTH, st); 
-+ if (st->level == QUOTATREE_DEPTH) -+ return *st->block; -+ else -+ return NULL; -+} -+ -+void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index) -+{ -+ int i, count; -+ struct quotatree_node *p; -+ void *leaf; -+ -+ if (QTREE_LEAFNUM(tree) <= index) -+ return NULL; -+ -+ count = 0; -+ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { -+ for (i = 0; i < QUOTATREE_BSIZE; i++) { -+ leaf = p->blocks[i]; -+ if (leaf == NULL) -+ continue; -+ if (count == index) -+ return leaf; -+ count++; -+ } -+ } -+ return NULL; -+} -+ -+/* returns data leaf (vz_quota_ugid) after _existent_ ugid (@id) -+ * in the tree... */ -+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id) -+{ -+ int off; -+ struct quotatree_node *parent, *p; -+ struct list_head *lh; -+ -+ /* get parent refering correct quota tree node of the last level */ -+ parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL); -+ if (!parent) -+ return NULL; -+ -+ off = (id & QUOTATREE_BMASK) + 1; /* next ugid */ -+ lh = &parent->list; -+ do { -+ p = list_entry(lh, struct quotatree_node, list); -+ for ( ; off < QUOTATREE_BSIZE; off++) -+ if (p->blocks[off]) -+ return p->blocks[off]; -+ off = 0; -+ lh = lh->next; -+ } while (lh != &QTREE_LEAFLVL(tree)->usedlh); -+ -+ return NULL; -+} -+ -+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, -+ struct quotatree_find_state *st, void *data) -+{ -+ struct quotatree_node *p; -+ int l, index; -+ -+ while (st->level < QUOTATREE_DEPTH) { -+ l = st->level; -+ if (!list_empty(&tree->levels[l].freelh)) { -+ p = list_entry(tree->levels[l].freelh.next, -+ struct quotatree_node, list); -+ list_del(&p->list); -+ } else { -+ p = kmalloc(sizeof(struct quotatree_node), GFP_KERNEL); -+ if (p == NULL) -+ return -ENOMEM; -+ /* save block number in the l-level -+ * it uses for quota file generation */ -+ p->num = tree->levels[l].freenum++; -+ } -+ list_add(&p->list, &tree->levels[l].usedlh); -+ memset(p->blocks, 0, sizeof(p->blocks)); 
-+ *st->block = p; -+ -+ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; -+ st->block = p->blocks + index; -+ st->level++; -+ } -+ tree->leaf_num++; -+ *st->block = data; -+ -+ return 0; -+} -+ -+static struct quotatree_node * -+quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id, -+ int level) -+{ -+ struct quotatree_node *parent; -+ struct quotatree_find_state st; -+ -+ parent = quotatree_follow(tree, id, level, &st); -+ if (st.level == QUOTATREE_DEPTH) -+ tree->leaf_num--; -+ *st.block = NULL; -+ return parent; -+} -+ -+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id) -+{ -+ struct quotatree_node *p; -+ int level, i; -+ -+ p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH); -+ for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) { -+ for (i = 0; i < QUOTATREE_BSIZE; i++) -+ if (p->blocks[i] != NULL) -+ return; -+ list_move(&p->list, &tree->levels[level].freelh); -+ p = quotatree_remove_ptr(tree, id, level); -+ } -+} -+ -+#if 0 -+static void quotatree_walk(struct quotatree_tree *tree, -+ struct quotatree_node *node_start, -+ quotaid_t id_start, -+ int level_start, int level_end, -+ int (*callback)(struct quotatree_tree *, -+ quotaid_t id, -+ int level, -+ void *ptr, -+ void *data), -+ void *data) -+{ -+ struct quotatree_node *p; -+ int l, shift, index; -+ quotaid_t id; -+ struct quotatree_find_state st; -+ -+ p = node_start; -+ l = level_start; -+ shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; -+ id = id_start; -+ index = 0; -+ -+ /* -+ * Invariants: -+ * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; -+ * id & ((1 << shift) - 1) == 0 -+ * p is l-level node corresponding to id -+ */ -+ do { -+ if (!p) -+ break; -+ -+ if (l < level_end) { -+ for (; index < QUOTATREE_BSIZE; index++) -+ if (p->blocks[index] != NULL) -+ break; -+ if (index < QUOTATREE_BSIZE) { -+ /* descend */ -+ p = p->blocks[index]; -+ l++; -+ shift -= QUOTAID_BBITS; -+ id += (quotaid_t)index << shift; -+ index = 0; -+ continue; -+ } -+ } -+ -+ 
if ((*callback)(tree, id, l, p, data)) -+ break; -+ -+ /* ascend and to the next node */ -+ p = quotatree_follow(tree, id, l, &st); -+ -+ index = ((id >> shift) & QUOTATREE_BMASK) + 1; -+ l--; -+ shift += QUOTAID_BBITS; -+ id &= ~(((quotaid_t)1 << shift) - 1); -+ } while (l >= level_start); -+} -+#endif -+ -+static void free_list(struct list_head *node_list) -+{ -+ struct quotatree_node *p, *tmp; -+ -+ list_for_each_entry_safe(p, tmp, node_list, list) { -+ list_del(&p->list); -+ kfree(p); -+ } -+} -+ -+static inline void quotatree_free_nodes(struct quotatree_tree *tree) -+{ -+ int i; -+ -+ for (i = 0; i < QUOTATREE_DEPTH; i++) { -+ free_list(&tree->levels[i].usedlh); -+ free_list(&tree->levels[i].freelh); -+ } -+} -+ -+static void quotatree_free_leafs(struct quotatree_tree *tree, -+ void (*dtor)(void *)) -+{ -+ int i; -+ struct quotatree_node *p; -+ -+ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { -+ for (i = 0; i < QUOTATREE_BSIZE; i++) { -+ if (p->blocks[i] == NULL) -+ continue; -+ -+ dtor(p->blocks[i]); -+ } -+ } -+} -+ -+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)) -+{ -+ quotatree_free_leafs(tree, dtor); -+ quotatree_free_nodes(tree); -+ kfree(tree); -+} -diff -upr linux-2.6.16.orig/fs/vzdq_ugid.c linux-2.6.16-026test009/fs/vzdq_ugid.c ---- linux-2.6.16.orig/fs/vzdq_ugid.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/fs/vzdq_ugid.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,1116 @@ -+/* -+ * Copyright (C) 2002 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ * This file contains Virtuozzo UID/GID disk quota implementation -+ */ -+ -+#include <linux/config.h> -+#include <linux/string.h> -+#include <linux/slab.h> -+#include <linux/list.h> -+#include <linux/smp_lock.h> -+#include <linux/rcupdate.h> -+#include <asm/uaccess.h> -+#include <linux/proc_fs.h> -+#include <linux/init.h> -+#include <linux/module.h> -+#include <linux/quota.h> -+#include <linux/quotaio_v2.h> -+#include <linux/virtinfo.h> -+ -+#include <linux/vzctl.h> -+#include <linux/vzctl_quota.h> -+#include <linux/vzquota.h> -+ -+/* -+ * XXX -+ * may be something is needed for sb->s_dquot->info[]? -+ */ -+ -+#define USRQUOTA_MASK (1 << USRQUOTA) -+#define GRPQUOTA_MASK (1 << GRPQUOTA) -+#define QTYPE2MASK(type) (1 << (type)) -+ -+static kmem_cache_t *vz_quota_ugid_cachep; -+ -+/* guard to protect vz_quota_master from destroy in quota_on/off. Also protects -+ * list on the hash table */ -+extern struct semaphore vz_quota_sem; -+ -+inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid) -+{ -+ if (qugid != VZ_QUOTA_UGBAD) -+ atomic_inc(&qugid->qugid_count); -+ return qugid; -+} -+ -+/* we don't limit users with zero limits */ -+static inline int vzquota_fake_stat(struct dq_stat *stat) -+{ -+ return stat->bhardlimit == 0 && stat->bsoftlimit == 0 && -+ stat->ihardlimit == 0 && stat->isoftlimit == 0; -+} -+ -+/* callback function for quotatree_free() */ -+static inline void vzquota_free_qugid(void *ptr) -+{ -+ kmem_cache_free(vz_quota_ugid_cachep, ptr); -+} -+ -+/* -+ * destroy ugid, if it have zero refcount, limits and usage -+ * must be called under qmblk->dq_sem -+ */ -+void vzquota_put_ugid(struct vz_quota_master *qmblk, -+ struct vz_quota_ugid *qugid) -+{ -+ if (qugid == VZ_QUOTA_UGBAD) -+ return; -+ qmblk_data_read_lock(qmblk); -+ if (atomic_dec_and_test(&qugid->qugid_count) && -+ (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 && -+ vzquota_fake_stat(&qugid->qugid_stat) && -+ qugid->qugid_stat.bcurrent == 0 && -+ qugid->qugid_stat.icurrent 
== 0) { -+ quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type), -+ qugid->qugid_id); -+ qmblk->dq_ugid_count--; -+ vzquota_free_qugid(qugid); -+ } -+ qmblk_data_read_unlock(qmblk); -+} -+ -+/* -+ * Get ugid block by its index, like it would present in array. -+ * In reality, this is not array - this is leafs chain of the tree. -+ * NULL if index is out of range. -+ * qmblk semaphore is required to protect the tree. -+ */ -+static inline struct vz_quota_ugid * -+vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type) -+{ -+ return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index); -+} -+ -+/* -+ * get next element from ugid "virtual array" -+ * ugid must be in current array and this array may not be changed between -+ * two accesses (quaranteed by "stopped" quota state and quota semaphore) -+ * qmblk semaphore is required to protect the tree -+ */ -+static inline struct vz_quota_ugid * -+vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid) -+{ -+ return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type), -+ qugid->qugid_id); -+} -+ -+/* -+ * requires dq_sem -+ */ -+struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, -+ unsigned int quota_id, int type, int flags) -+{ -+ struct vz_quota_ugid *qugid; -+ struct quotatree_tree *tree; -+ struct quotatree_find_state st; -+ -+ tree = QUGID_TREE(qmblk, type); -+ qugid = quotatree_find(tree, quota_id, &st); -+ if (qugid) -+ goto success; -+ -+ /* caller does not want alloc */ -+ if (flags & VZDQUG_FIND_DONT_ALLOC) -+ goto fail; -+ -+ if (flags & VZDQUG_FIND_FAKE) -+ goto doit; -+ -+ /* check limit */ -+ if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max) -+ goto fail; -+ -+ /* see comment at VZDQUG_FIXED_SET define */ -+ if (qmblk->dq_flags & VZDQUG_FIXED_SET) -+ goto fail; -+ -+doit: -+ /* alloc new structure */ -+ qugid = kmem_cache_alloc(vz_quota_ugid_cachep, -+ SLAB_NOFS | __GFP_NOFAIL); -+ if (qugid == NULL) -+ goto fail; -+ -+ /* initialize 
new structure */ -+ qugid->qugid_id = quota_id; -+ memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat)); -+ qugid->qugid_type = type; -+ atomic_set(&qugid->qugid_count, 0); -+ -+ /* insert in tree */ -+ if (quotatree_insert(tree, quota_id, &st, qugid) < 0) -+ goto fail_insert; -+ qmblk->dq_ugid_count++; -+ -+success: -+ vzquota_get_ugid(qugid); -+ return qugid; -+ -+fail_insert: -+ vzquota_free_qugid(qugid); -+fail: -+ return VZ_QUOTA_UGBAD; -+} -+ -+/* -+ * takes dq_sem, may schedule -+ */ -+struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, -+ unsigned int quota_id, int type, int flags) -+{ -+ struct vz_quota_ugid *qugid; -+ -+ down(&qmblk->dq_sem); -+ qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags); -+ up(&qmblk->dq_sem); -+ -+ return qugid; -+} -+ -+/* -+ * destroy all ugid records on given quota master -+ */ -+void vzquota_kill_ugid(struct vz_quota_master *qmblk) -+{ -+ BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) || -+ (qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL)); -+ -+ if (qmblk->dq_uid_tree != NULL) { -+ quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid); -+ quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid); -+ } -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * Management interface to ugid quota for (super)users. -+ * --------------------------------------------------------------------- */ -+ -+/** -+ * vzquota_find_qmblk - helper to emulate quota on virtual filesystems -+ * -+ * This function finds a quota master block corresponding to the root of -+ * a virtual filesystem. -+ * Returns a quota master block with reference taken, or %NULL if not under -+ * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation -+ * operations will fail). -+ * -+ * Note: this function uses vzquota_inode_qmblk(). 
-+ * The latter is a rather confusing function: it returns qmblk that used to be -+ * on the inode some time ago (without guarantee that it still has any -+ * relations to the inode). So, vzquota_find_qmblk() leaves it up to the -+ * caller to think whether the inode could have changed its qmblk and what to -+ * do in that case. -+ * Currently, the callers appear to not care :( -+ */ -+struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb) -+{ -+ struct inode *qrinode; -+ struct vz_quota_master *qmblk; -+ -+ qmblk = NULL; -+ qrinode = NULL; -+ if (sb->s_op->get_quota_root != NULL) -+ qrinode = sb->s_op->get_quota_root(sb); -+ if (qrinode != NULL) -+ qmblk = vzquota_inode_qmblk(qrinode); -+ return qmblk; -+} -+ -+static int vzquota_initialize2(struct inode *inode, int type) -+{ -+ return QUOTA_OK; -+} -+ -+static int vzquota_drop2(struct inode *inode) -+{ -+ return QUOTA_OK; -+} -+ -+static int vzquota_alloc_space2(struct inode *inode, -+ qsize_t number, int prealloc) -+{ -+ inode_add_bytes(inode, number); -+ return QUOTA_OK; -+} -+ -+static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number) -+{ -+ return QUOTA_OK; -+} -+ -+static int vzquota_free_space2(struct inode *inode, qsize_t number) -+{ -+ inode_sub_bytes(inode, number); -+ return QUOTA_OK; -+} -+ -+static int vzquota_free_inode2(const struct inode *inode, unsigned long number) -+{ -+ return QUOTA_OK; -+} -+ -+static int vzquota_transfer2(struct inode *inode, struct iattr *iattr) -+{ -+ return QUOTA_OK; -+} -+ -+struct dquot_operations vz_quota_operations2 = { -+ initialize: vzquota_initialize2, -+ drop: vzquota_drop2, -+ alloc_space: vzquota_alloc_space2, -+ alloc_inode: vzquota_alloc_inode2, -+ free_space: vzquota_free_space2, -+ free_inode: vzquota_free_inode2, -+ transfer: vzquota_transfer2 -+}; -+ -+static int vz_quota_on(struct super_block *sb, int type, -+ int format_id, char *path) -+{ -+ struct vz_quota_master *qmblk; -+ int mask, mask2; -+ int err; -+ -+ qmblk = 
vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); -+ err = -ESRCH; -+ if (qmblk == NULL) -+ goto out; -+ err = -EIO; -+ if (qmblk == VZ_QUOTA_BAD) -+ goto out; -+ -+ mask = 0; -+ mask2 = 0; -+ sb->dq_op = &vz_quota_operations2; -+ sb->s_qcop = &vz_quotactl_operations; -+ if (type == USRQUOTA) { -+ mask = DQUOT_USR_ENABLED; -+ mask2 = VZDQ_USRQUOTA; -+ } -+ if (type == GRPQUOTA) { -+ mask = DQUOT_GRP_ENABLED; -+ mask2 = VZDQ_GRPQUOTA; -+ } -+ err = -EBUSY; -+ if (qmblk->dq_flags & mask2) -+ goto out; -+ -+ err = 0; -+ qmblk->dq_flags |= mask2; -+ sb->s_dquot.flags |= mask; -+ -+out: -+ up(&vz_quota_sem); -+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) -+ qmblk_put(qmblk); -+ return err; -+} -+ -+static int vz_quota_off(struct super_block *sb, int type) -+{ -+ struct vz_quota_master *qmblk; -+ int mask2; -+ int err; -+ -+ qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); -+ err = -ESRCH; -+ if (qmblk == NULL) -+ goto out; -+ err = -EIO; -+ if (qmblk == VZ_QUOTA_BAD) -+ goto out; -+ -+ mask2 = 0; -+ if (type == USRQUOTA) -+ mask2 = VZDQ_USRQUOTA; -+ if (type == GRPQUOTA) -+ mask2 = VZDQ_GRPQUOTA; -+ err = -EINVAL; -+ if (!(qmblk->dq_flags & mask2)) -+ goto out; -+ -+ qmblk->dq_flags &= ~mask2; -+ err = 0; -+ -+out: -+ up(&vz_quota_sem); -+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) -+ qmblk_put(qmblk); -+ return err; -+} -+ -+static int vz_quota_sync(struct super_block *sb, int type) -+{ -+ return 0; /* vz quota is always uptodate */ -+} -+ -+static int vz_get_dqblk(struct super_block *sb, int type, -+ qid_t id, struct if_dqblk *di) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ugid *ugid; -+ int err; -+ -+ qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); -+ err = -ESRCH; -+ if (qmblk == NULL) -+ goto out; -+ err = -EIO; -+ if (qmblk == VZ_QUOTA_BAD) -+ goto out; -+ -+ err = 0; -+ ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC); -+ if (ugid != VZ_QUOTA_UGBAD) { -+ qmblk_data_read_lock(qmblk); -+ di->dqb_bhardlimit = 
ugid->qugid_stat.bhardlimit >> 10; -+ di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10; -+ di->dqb_curspace = ugid->qugid_stat.bcurrent; -+ di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit; -+ di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit; -+ di->dqb_curinodes = ugid->qugid_stat.icurrent; -+ di->dqb_btime = ugid->qugid_stat.btime; -+ di->dqb_itime = ugid->qugid_stat.itime; -+ qmblk_data_read_unlock(qmblk); -+ di->dqb_valid = QIF_ALL; -+ vzquota_put_ugid(qmblk, ugid); -+ } else { -+ memset(di, 0, sizeof(*di)); -+ di->dqb_valid = QIF_ALL; -+ } -+ -+out: -+ up(&vz_quota_sem); -+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) -+ qmblk_put(qmblk); -+ return err; -+} -+ -+/* must be called under vz_quota_sem */ -+static int __vz_set_dqblk(struct vz_quota_master *qmblk, -+ int type, qid_t id, struct if_dqblk *di) -+{ -+ struct vz_quota_ugid *ugid; -+ -+ ugid = vzquota_find_ugid(qmblk, id, type, 0); -+ if (ugid == VZ_QUOTA_UGBAD) -+ return -ESRCH; -+ -+ qmblk_data_write_lock(qmblk); -+ /* -+ * Subtle compatibility breakage. -+ * -+ * Some old non-vz kernel quota didn't start grace period -+ * if the new soft limit happens to be below the usage. -+ * Non-vz kernel quota in 2.4.20 starts the grace period -+ * (if it hasn't been started). -+ * Current non-vz kernel performs even more complicated -+ * manipulations... -+ * -+ * Also, current non-vz kernels have inconsistency related to -+ * the grace time start. In regular operations the grace period -+ * is started if the usage is greater than the soft limit (and, -+ * strangely, is cancelled if the usage is less). -+ * However, set_dqblk starts the grace period if the usage is greater -+ * or equal to the soft limit. -+ * -+ * Here we try to mimic the behavior of the current non-vz kernel. 
-+ */ -+ if (di->dqb_valid & QIF_BLIMITS) { -+ ugid->qugid_stat.bhardlimit = -+ (__u64)di->dqb_bhardlimit << 10; -+ ugid->qugid_stat.bsoftlimit = -+ (__u64)di->dqb_bsoftlimit << 10; -+ if (di->dqb_bsoftlimit == 0 || -+ ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit) -+ ugid->qugid_stat.btime = 0; -+ else if (!(di->dqb_valid & QIF_BTIME)) -+ ugid->qugid_stat.btime = CURRENT_TIME_SECONDS -+ + qmblk->dq_ugid_info[type].bexpire; -+ else -+ ugid->qugid_stat.btime = di->dqb_btime; -+ } -+ if (di->dqb_valid & QIF_ILIMITS) { -+ ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit; -+ ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit; -+ if (di->dqb_isoftlimit == 0 || -+ ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit) -+ ugid->qugid_stat.itime = 0; -+ else if (!(di->dqb_valid & QIF_ITIME)) -+ ugid->qugid_stat.itime = CURRENT_TIME_SECONDS -+ + qmblk->dq_ugid_info[type].iexpire; -+ else -+ ugid->qugid_stat.itime = di->dqb_itime; -+ } -+ qmblk_data_write_unlock(qmblk); -+ vzquota_put_ugid(qmblk, ugid); -+ -+ return 0; -+} -+ -+static int vz_set_dqblk(struct super_block *sb, int type, -+ qid_t id, struct if_dqblk *di) -+{ -+ struct vz_quota_master *qmblk; -+ int err; -+ -+ qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); -+ err = -ESRCH; -+ if (qmblk == NULL) -+ goto out; -+ err = -EIO; -+ if (qmblk == VZ_QUOTA_BAD) -+ goto out; -+ err = __vz_set_dqblk(qmblk, type, id, di); -+out: -+ up(&vz_quota_sem); -+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) -+ qmblk_put(qmblk); -+ return err; -+} -+ -+static int vz_get_dqinfo(struct super_block *sb, int type, -+ struct if_dqinfo *ii) -+{ -+ struct vz_quota_master *qmblk; -+ int err; -+ -+ qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); -+ err = -ESRCH; -+ if (qmblk == NULL) -+ goto out; -+ err = -EIO; -+ if (qmblk == VZ_QUOTA_BAD) -+ goto out; -+ -+ err = 0; -+ ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire; -+ ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire; -+ ii->dqi_flags = 0; -+ ii->dqi_valid = 
IIF_ALL; -+ -+out: -+ up(&vz_quota_sem); -+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) -+ qmblk_put(qmblk); -+ return err; -+} -+ -+/* must be called under vz_quota_sem */ -+static int __vz_set_dqinfo(struct vz_quota_master *qmblk, -+ int type, struct if_dqinfo *ii) -+{ -+ if (ii->dqi_valid & IIF_FLAGS) -+ if (ii->dqi_flags & DQF_MASK) -+ return -EINVAL; -+ -+ if (ii->dqi_valid & IIF_BGRACE) -+ qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace; -+ if (ii->dqi_valid & IIF_IGRACE) -+ qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace; -+ return 0; -+} -+ -+static int vz_set_dqinfo(struct super_block *sb, int type, -+ struct if_dqinfo *ii) -+{ -+ struct vz_quota_master *qmblk; -+ int err; -+ -+ qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); -+ err = -ESRCH; -+ if (qmblk == NULL) -+ goto out; -+ err = -EIO; -+ if (qmblk == VZ_QUOTA_BAD) -+ goto out; -+ err = __vz_set_dqinfo(qmblk, type, ii); -+out: -+ up(&vz_quota_sem); -+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) -+ qmblk_put(qmblk); -+ return err; -+} -+ -+#ifdef CONFIG_QUOTA_COMPAT -+ -+#define Q_GETQUOTI_SIZE 1024 -+ -+#define UGID2DQBLK(dst, src) \ -+ do { \ -+ (dst).dqb_ihardlimit = (src)->qugid_stat.ihardlimit; \ -+ (dst).dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \ -+ (dst).dqb_curinodes = (src)->qugid_stat.icurrent; \ -+ /* in 1K blocks */ \ -+ (dst).dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \ -+ /* in 1K blocks */ \ -+ (dst).dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \ -+ /* in bytes, 64 bit */ \ -+ (dst).dqb_curspace = (src)->qugid_stat.bcurrent; \ -+ (dst).dqb_btime = (src)->qugid_stat.btime; \ -+ (dst).dqb_itime = (src)->qugid_stat.itime; \ -+ } while (0) -+ -+static int vz_get_quoti(struct super_block *sb, int type, qid_t idx, -+ struct v2_disk_dqblk *dqblk) -+{ -+ struct vz_quota_master *qmblk; -+ struct v2_disk_dqblk data; -+ struct vz_quota_ugid *ugid; -+ int count; -+ int err; -+ -+ qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); -+ err = -ESRCH; -+ if 
(qmblk == NULL) -+ goto out; -+ err = -EIO; -+ if (qmblk == VZ_QUOTA_BAD) -+ goto out; -+ -+ down(&qmblk->dq_sem); -+ for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0; -+ ugid != NULL && count < Q_GETQUOTI_SIZE; -+ count++) -+ { -+ qmblk_data_read_lock(qmblk); -+ UGID2DQBLK(data, ugid); -+ qmblk_data_read_unlock(qmblk); -+ data.dqb_id = ugid->qugid_id; -+ if (copy_to_user(dqblk, &data, sizeof(data))) -+ goto fault; -+ dqblk++; -+ -+ /* Find next entry */ -+ ugid = vzquota_get_next(qmblk, ugid); -+ BUG_ON(ugid != NULL && ugid->qugid_type != type); -+ } -+ err = count; -+out_ugid: -+ up(&qmblk->dq_sem); -+out: -+ up(&vz_quota_sem); -+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) -+ qmblk_put(qmblk); -+ -+ return err; -+ -+fault: -+ err = count ? count : -EFAULT; -+ goto out_ugid; -+} -+ -+#endif -+ -+struct quotactl_ops vz_quotactl_operations = { -+ quota_on: vz_quota_on, -+ quota_off: vz_quota_off, -+ quota_sync: vz_quota_sync, -+ get_info: vz_get_dqinfo, -+ set_info: vz_set_dqinfo, -+ get_dqblk: vz_get_dqblk, -+ set_dqblk: vz_set_dqblk, -+#ifdef CONFIG_QUOTA_COMPAT -+ get_quoti: vz_get_quoti -+#endif -+}; -+ -+ -+/* ---------------------------------------------------------------------- -+ * Management interface for host system admins. 
-+ * --------------------------------------------------------------------- */ -+ -+static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size, -+ struct vz_quota_iface *u_ugid_buf) -+{ -+ struct vz_quota_master *qmblk; -+ int ret; -+ -+ down(&vz_quota_sem); -+ -+ ret = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ ret = -EBUSY; -+ if (qmblk->dq_state != VZDQ_STARTING) -+ goto out; /* working quota doesn't accept new ugids */ -+ -+ ret = 0; -+ /* start to add ugids */ -+ for (ret = 0; ret < ugid_size; ret++) { -+ struct vz_quota_iface ugid_buf; -+ struct vz_quota_ugid *ugid; -+ -+ if (copy_from_user(&ugid_buf, u_ugid_buf, sizeof(ugid_buf))) -+ break; -+ -+ if (ugid_buf.qi_type >= MAXQUOTAS) -+ break; /* bad quota type - this is the only check */ -+ -+ ugid = vzquota_find_ugid(qmblk, -+ ugid_buf.qi_id, ugid_buf.qi_type, 0); -+ if (ugid == VZ_QUOTA_UGBAD) { -+ qmblk->dq_flags |= VZDQUG_FIXED_SET; -+ break; /* limit reached */ -+ } -+ -+ /* update usage/limits -+ * we can copy the data without the lock, because the data -+ * cannot be modified in VZDQ_STARTING state */ -+ ugid->qugid_stat = ugid_buf.qi_stat; -+ -+ vzquota_put_ugid(qmblk, ugid); -+ -+ u_ugid_buf++; /* next user buffer */ -+ } -+out: -+ up(&vz_quota_sem); -+ -+ return ret; -+} -+ -+static int quota_ugid_setgrace(unsigned int quota_id, -+ struct dq_info u_dq_info[]) -+{ -+ struct vz_quota_master *qmblk; -+ struct dq_info dq_info[MAXQUOTAS]; -+ struct dq_info *target; -+ int err, type; -+ -+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = -EBUSY; -+ if (qmblk->dq_state != VZDQ_STARTING) -+ goto out; /* working quota doesn't accept changing options */ -+ -+ err = -EFAULT; -+ if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info))) -+ goto out; -+ -+ err = 0; -+ -+ /* update in qmblk */ -+ for (type = 0; type < MAXQUOTAS; type ++) { -+ target = 
&qmblk->dq_ugid_info[type]; -+ target->bexpire = dq_info[type].bexpire; -+ target->iexpire = dq_info[type].iexpire; -+ } -+out: -+ up(&vz_quota_sem); -+ -+ return err; -+} -+ -+static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size, -+ struct vz_quota_iface *u_ugid_buf) -+{ -+ int type, count; -+ struct vz_quota_ugid *ugid; -+ -+ if (QTREE_LEAFNUM(qmblk->dq_uid_tree) + -+ QTREE_LEAFNUM(qmblk->dq_gid_tree) -+ <= index) -+ return 0; -+ -+ count = 0; -+ -+ type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? USRQUOTA : GRPQUOTA; -+ if (type == GRPQUOTA) -+ index -= QTREE_LEAFNUM(qmblk->dq_uid_tree); -+ -+ /* loop through ugid and then qgid quota */ -+repeat: -+ for (ugid = vzquota_get_byindex(qmblk, index, type); -+ ugid != NULL && count < size; -+ ugid = vzquota_get_next(qmblk, ugid), count++) -+ { -+ struct vz_quota_iface ugid_buf; -+ -+ /* form interface buffer and send in to user-level */ -+ qmblk_data_read_lock(qmblk); -+ memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat, -+ sizeof(ugid_buf.qi_stat)); -+ qmblk_data_read_unlock(qmblk); -+ ugid_buf.qi_id = ugid->qugid_id; -+ ugid_buf.qi_type = ugid->qugid_type; -+ -+ if (copy_to_user(u_ugid_buf, &ugid_buf, sizeof(ugid_buf))) -+ goto fault; -+ u_ugid_buf++; /* next portion of user buffer */ -+ } -+ -+ if (type == USRQUOTA && count < size) { -+ type = GRPQUOTA; -+ index = 0; -+ goto repeat; -+ } -+ -+ return count; -+ -+fault: -+ return count ? 
count : -EFAULT; -+} -+ -+static int quota_ugid_getstat(unsigned int quota_id, -+ int index, int size, struct vz_quota_iface *u_ugid_buf) -+{ -+ struct vz_quota_master *qmblk; -+ int err; -+ -+ if (index < 0 || size < 0) -+ return -EINVAL; -+ -+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ down(&qmblk->dq_sem); -+ err = do_quota_ugid_getstat(qmblk, index, size, u_ugid_buf); -+ up(&qmblk->dq_sem); -+ -+out: -+ up(&vz_quota_sem); -+ return err; -+} -+ -+static int quota_ugid_getgrace(unsigned int quota_id, -+ struct dq_info u_dq_info[]) -+{ -+ struct vz_quota_master *qmblk; -+ struct dq_info dq_info[MAXQUOTAS]; -+ struct dq_info *target; -+ int err, type; -+ -+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = 0; -+ /* update from qmblk */ -+ for (type = 0; type < MAXQUOTAS; type ++) { -+ target = &qmblk->dq_ugid_info[type]; -+ dq_info[type].bexpire = target->bexpire; -+ dq_info[type].iexpire = target->iexpire; -+ dq_info[type].flags = target->flags; -+ } -+ -+ if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info))) -+ err = -EFAULT; -+out: -+ up(&vz_quota_sem); -+ -+ return err; -+} -+ -+static int quota_ugid_getconfig(unsigned int quota_id, -+ struct vz_quota_ugid_stat *info) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ugid_stat kinfo; -+ int err; -+ -+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = 0; -+ kinfo.limit = qmblk->dq_ugid_max; -+ kinfo.count = qmblk->dq_ugid_count; -+ kinfo.flags = qmblk->dq_flags; -+ -+ if (copy_to_user(info, &kinfo, sizeof(kinfo))) -+ err = -EFAULT; -+out: -+ up(&vz_quota_sem); -+ -+ return err; -+} -+ -+static int quota_ugid_setconfig(unsigned int quota_id, -+ struct vz_quota_ugid_stat *info) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ugid_stat kinfo; -+ int err; -+ 
-+ down(&vz_quota_sem); -+ -+ err = -ENOENT; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = -EFAULT; -+ if (copy_from_user(&kinfo, info, sizeof(kinfo))) -+ goto out; -+ -+ err = 0; -+ qmblk->dq_ugid_max = kinfo.limit; -+ if (qmblk->dq_state == VZDQ_STARTING) { -+ qmblk->dq_flags = kinfo.flags; -+ if (qmblk->dq_flags & VZDQUG_ON) -+ qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA; -+ } -+ -+out: -+ up(&vz_quota_sem); -+ -+ return err; -+} -+ -+static int quota_ugid_setlimit(unsigned int quota_id, -+ struct vz_quota_ugid_setlimit *u_lim) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ugid_setlimit lim; -+ int err; -+ -+ down(&vz_quota_sem); -+ -+ err = -ESRCH; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = -EFAULT; -+ if (copy_from_user(&lim, u_lim, sizeof(lim))) -+ goto out; -+ -+ err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb); -+ -+out: -+ up(&vz_quota_sem); -+ -+ return err; -+} -+ -+static int quota_ugid_setinfo(unsigned int quota_id, -+ struct vz_quota_ugid_setinfo *u_info) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ugid_setinfo info; -+ int err; -+ -+ down(&vz_quota_sem); -+ -+ err = -ESRCH; -+ qmblk = vzquota_find_master(quota_id); -+ if (qmblk == NULL) -+ goto out; -+ -+ err = -EFAULT; -+ if (copy_from_user(&info, u_info, sizeof(info))) -+ goto out; -+ -+ err = __vz_set_dqinfo(qmblk, info.type, &info.dqi); -+ -+out: -+ up(&vz_quota_sem); -+ -+ return err; -+} -+ -+/* -+ * This is a system call to maintain UGID quotas -+ * Note this call is allowed to run ONLY from VE0 -+ */ -+long do_vzquotaugidctl(struct vzctl_quotaugidctl *qub) -+{ -+ int ret; -+ -+ ret = -EPERM; -+ /* access allowed only from root of VE0 */ -+ if (!capable(CAP_SYS_RESOURCE) || -+ !capable(CAP_SYS_ADMIN)) -+ goto out; -+ -+ switch (qub->cmd) { -+ case VZ_DQ_UGID_GETSTAT: -+ ret = quota_ugid_getstat(qub->quota_id, -+ qub->ugid_index, qub->ugid_size, -+ (struct vz_quota_iface 
*)qub->addr); -+ break; -+ case VZ_DQ_UGID_ADDSTAT: -+ ret = quota_ugid_addstat(qub->quota_id, qub->ugid_size, -+ (struct vz_quota_iface *)qub->addr); -+ break; -+ case VZ_DQ_UGID_GETGRACE: -+ ret = quota_ugid_getgrace(qub->quota_id, -+ (struct dq_info *)qub->addr); -+ break; -+ case VZ_DQ_UGID_SETGRACE: -+ ret = quota_ugid_setgrace(qub->quota_id, -+ (struct dq_info *)qub->addr); -+ break; -+ case VZ_DQ_UGID_GETCONFIG: -+ ret = quota_ugid_getconfig(qub->quota_id, -+ (struct vz_quota_ugid_stat *)qub->addr); -+ break; -+ case VZ_DQ_UGID_SETCONFIG: -+ ret = quota_ugid_setconfig(qub->quota_id, -+ (struct vz_quota_ugid_stat *)qub->addr); -+ break; -+ case VZ_DQ_UGID_SETLIMIT: -+ ret = quota_ugid_setlimit(qub->quota_id, -+ (struct vz_quota_ugid_setlimit *) -+ qub->addr); -+ break; -+ case VZ_DQ_UGID_SETINFO: -+ ret = quota_ugid_setinfo(qub->quota_id, -+ (struct vz_quota_ugid_setinfo *) -+ qub->addr); -+ break; -+ default: -+ ret = -EINVAL; -+ goto out; -+ } -+out: -+ return ret; -+} -+ -+static void ugid_quota_on_sb(struct super_block *sb) -+{ -+ struct super_block *real_sb; -+ struct vz_quota_master *qmblk; -+ -+ if (!sb->s_op->get_quota_root) -+ return; -+ -+ real_sb = sb->s_op->get_quota_root(sb)->i_sb; -+ if (real_sb->dq_op != &vz_quota_operations) -+ return; -+ -+ sb->dq_op = &vz_quota_operations2; -+ sb->s_qcop = &vz_quotactl_operations; -+ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); -+ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); -+ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; -+ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; -+ -+ qmblk = vzquota_find_qmblk(sb); -+ if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) -+ return; -+ down(&vz_quota_sem); -+ if (qmblk->dq_flags & VZDQ_USRQUOTA) -+ sb->s_dquot.flags |= DQUOT_USR_ENABLED; -+ if (qmblk->dq_flags & VZDQ_GRPQUOTA) -+ sb->s_dquot.flags |= DQUOT_GRP_ENABLED; -+ up(&vz_quota_sem); -+ qmblk_put(qmblk); -+} -+ -+static void 
ugid_quota_off_sb(struct super_block *sb) -+{ -+ /* can't make quota off on mounted super block */ -+ BUG_ON(sb->s_root != NULL); -+} -+ -+static int ugid_notifier_call(struct vnotifier_block *self, -+ unsigned long n, void *data, int old_ret) -+{ -+ struct virt_info_quota *viq; -+ -+ viq = (struct virt_info_quota *)data; -+ -+ switch (n) { -+ case VIRTINFO_QUOTA_ON: -+ ugid_quota_on_sb(viq->super); -+ break; -+ case VIRTINFO_QUOTA_OFF: -+ ugid_quota_off_sb(viq->super); -+ break; -+ case VIRTINFO_QUOTA_GETSTAT: -+ break; -+ default: -+ return old_ret; -+ } -+ return NOTIFY_OK; -+} -+ -+static struct vnotifier_block ugid_notifier_block = { -+ .notifier_call = ugid_notifier_call, -+}; -+ -+/* ---------------------------------------------------------------------- -+ * Init/exit. -+ * --------------------------------------------------------------------- */ -+ -+struct quota_format_type vz_quota_empty_v2_format = { -+ qf_fmt_id: QFMT_VFS_V0, -+ qf_ops: NULL, -+ qf_owner: THIS_MODULE -+}; -+ -+int vzquota_ugid_init() -+{ -+ int err; -+ -+ vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid", -+ sizeof(struct vz_quota_ugid), -+ 0, SLAB_HWCACHE_ALIGN, -+ NULL, NULL); -+ if (vz_quota_ugid_cachep == NULL) -+ goto err_slab; -+ -+ err = register_quota_format(&vz_quota_empty_v2_format); -+ if (err) -+ goto err_reg; -+ -+ virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block); -+ return 0; -+ -+err_reg: -+ kmem_cache_destroy(vz_quota_ugid_cachep); -+ return err; -+ -+err_slab: -+ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); -+ return -ENOMEM; -+} -+ -+void vzquota_ugid_release() -+{ -+ virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block); -+ unregister_quota_format(&vz_quota_empty_v2_format); -+ -+ if (kmem_cache_destroy(vz_quota_ugid_cachep)) -+ printk(KERN_ERR "VZQUOTA: kmem_cache_destroy failed\n"); -+} -diff -upr linux-2.6.16.orig/fs/vzdquot.c linux-2.6.16-026test009/fs/vzdquot.c ---- linux-2.6.16.orig/fs/vzdquot.c 2006-04-19 
15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/fs/vzdquot.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,1705 @@ -+/* -+ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * This file contains the core of Virtuozzo disk quota implementation: -+ * maintenance of VZDQ information in inodes, -+ * external interfaces, -+ * module entry. -+ */ -+ -+#include <linux/config.h> -+#include <linux/kernel.h> -+#include <linux/string.h> -+#include <linux/list.h> -+#include <asm/atomic.h> -+#include <linux/spinlock.h> -+#include <asm/semaphore.h> -+#include <linux/slab.h> -+#include <linux/fs.h> -+#include <linux/dcache.h> -+#include <linux/quota.h> -+#include <linux/rcupdate.h> -+#include <linux/module.h> -+#include <asm/uaccess.h> -+#include <linux/vzctl.h> -+#include <linux/vzctl_quota.h> -+#include <linux/vzquota.h> -+#include <linux/virtinfo.h> -+#include <linux/vzdq_tree.h> -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Locking -+ * -+ * ---------------------------------------------------------------------- */ -+ -+/* -+ * Serializes on/off and all other do_vzquotactl operations. -+ * Protects qmblk hash. -+ */ -+struct semaphore vz_quota_sem; -+ -+/* -+ * Data access locks -+ * inode_qmblk -+ * protects qmblk pointers in all inodes and qlnk content in general -+ * (but not qmblk content); -+ * also protects related qmblk invalidation procedures; -+ * can't be per-inode because of vzquota_dtree_qmblk complications -+ * and problems with serialization with quota_on, -+ * but can be per-superblock; -+ * qmblk_data -+ * protects qmblk fields (such as current usage) -+ * quota_data -+ * protects charge/uncharge operations, thus, implies -+ * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock -+ * (to protect ugid pointers). 
-+ * -+ * Lock order: -+ * inode_qmblk_lock -> dcache_lock -+ * inode_qmblk_lock -> qmblk_data -+ */ -+static spinlock_t vzdq_qmblk_lock = SPIN_LOCK_UNLOCKED; -+ -+inline void inode_qmblk_lock(struct super_block *sb) -+{ -+ spin_lock(&vzdq_qmblk_lock); -+} -+ -+inline void inode_qmblk_unlock(struct super_block *sb) -+{ -+ spin_unlock(&vzdq_qmblk_lock); -+} -+ -+inline void qmblk_data_read_lock(struct vz_quota_master *qmblk) -+{ -+ spin_lock(&qmblk->dq_data_lock); -+} -+ -+inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk) -+{ -+ spin_unlock(&qmblk->dq_data_lock); -+} -+ -+inline void qmblk_data_write_lock(struct vz_quota_master *qmblk) -+{ -+ spin_lock(&qmblk->dq_data_lock); -+} -+ -+inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk) -+{ -+ spin_unlock(&qmblk->dq_data_lock); -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Master hash table handling. -+ * -+ * SMP not safe, serialied by vz_quota_sem within quota syscalls -+ * -+ * --------------------------------------------------------------------- */ -+ -+static kmem_cache_t *vzquota_cachep; -+ -+/* -+ * Hash function. 
-+ */ -+#define QHASH_BITS 6 -+#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS) -+#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1) -+ -+struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE]; -+int vzquota_hash_size = VZ_QUOTA_HASH_SIZE; -+ -+static inline int vzquota_hash_func(unsigned int qid) -+{ -+ return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK); -+} -+ -+/** -+ * vzquota_alloc_master - alloc and instantiate master quota record -+ * -+ * Returns: -+ * pointer to newly created record if SUCCESS -+ * -ENOMEM if out of memory -+ * -EEXIST if record with given quota_id already exist -+ */ -+struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, -+ struct vz_quota_stat *qstat) -+{ -+ int err; -+ struct vz_quota_master *qmblk; -+ -+ err = -EEXIST; -+ if (vzquota_find_master(quota_id) != NULL) -+ goto out; -+ -+ err = -ENOMEM; -+ qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL); -+ if (qmblk == NULL) -+ goto out; -+#ifdef CONFIG_VZ_QUOTA_UGID -+ qmblk->dq_uid_tree = quotatree_alloc(); -+ if (!qmblk->dq_uid_tree) -+ goto out_free; -+ -+ qmblk->dq_gid_tree = quotatree_alloc(); -+ if (!qmblk->dq_gid_tree) -+ goto out_free_tree; -+#endif -+ -+ qmblk->dq_state = VZDQ_STARTING; -+ init_MUTEX(&qmblk->dq_sem); -+ spin_lock_init(&qmblk->dq_data_lock); -+ -+ qmblk->dq_id = quota_id; -+ qmblk->dq_stat = qstat->dq_stat; -+ qmblk->dq_info = qstat->dq_info; -+ qmblk->dq_root_dentry = NULL; -+ qmblk->dq_root_mnt = NULL; -+ qmblk->dq_sb = NULL; -+ qmblk->dq_ugid_count = 0; -+ qmblk->dq_ugid_max = 0; -+ qmblk->dq_flags = 0; -+ memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info)); -+ INIT_LIST_HEAD(&qmblk->dq_ilink_list); -+ -+ atomic_set(&qmblk->dq_count, 1); -+ -+ /* insert in hash chain */ -+ list_add(&qmblk->dq_hash, -+ &vzquota_hash_table[vzquota_hash_func(quota_id)]); -+ -+ /* success */ -+ return qmblk; -+ -+out_free_tree: -+ quotatree_free(qmblk->dq_uid_tree, NULL); -+out_free: -+ kmem_cache_free(vzquota_cachep, qmblk); -+out: -+ return ERR_PTR(err); -+} -+ 
-+static struct vz_quota_master *vzquota_alloc_fake(void) -+{ -+ struct vz_quota_master *qmblk; -+ -+ qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL); -+ if (qmblk == NULL) -+ return NULL; -+ memset(qmblk, 0, sizeof(*qmblk)); -+ qmblk->dq_state = VZDQ_STOPING; -+ qmblk->dq_flags = VZDQ_NOQUOT; -+ spin_lock_init(&qmblk->dq_data_lock); -+ INIT_LIST_HEAD(&qmblk->dq_ilink_list); -+ atomic_set(&qmblk->dq_count, 1); -+ return qmblk; -+} -+ -+/** -+ * vzquota_find_master - find master record with given id -+ * -+ * Returns qmblk without touching its refcounter. -+ * Called under vz_quota_sem. -+ */ -+struct vz_quota_master *vzquota_find_master(unsigned int quota_id) -+{ -+ int i; -+ struct vz_quota_master *qp; -+ -+ i = vzquota_hash_func(quota_id); -+ list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) { -+ if (qp->dq_id == quota_id) -+ return qp; -+ } -+ return NULL; -+} -+ -+/** -+ * vzquota_free_master - release resources taken by qmblk, freeing memory -+ * -+ * qmblk is assumed to be already taken out from the hash. -+ * Should be called outside vz_quota_sem. -+ */ -+void vzquota_free_master(struct vz_quota_master *qmblk) -+{ -+#ifdef CONFIG_VZ_QUOTA_UGID -+ vzquota_kill_ugid(qmblk); -+#endif -+ BUG_ON(!list_empty(&qmblk->dq_ilink_list)); -+ kmem_cache_free(vzquota_cachep, qmblk); -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Passing quota information through current -+ * -+ * Used in inode -> qmblk lookup at inode creation stage (since at that -+ * time there are no links between the inode being created and its parent -+ * directory). 
-+ * -+ * --------------------------------------------------------------------- */ -+ -+#define VZDQ_CUR_MAGIC 0x57d0fee2 -+ -+static inline int vzquota_cur_qmblk_check(void) -+{ -+ return current->magic == VZDQ_CUR_MAGIC; -+} -+ -+static inline struct inode *vzquota_cur_qmblk_fetch(void) -+{ -+ return current->ino; -+} -+ -+static inline void vzquota_cur_qmblk_set(struct inode *data) -+{ -+ struct task_struct *tsk; -+ -+ tsk = current; -+ tsk->magic = VZDQ_CUR_MAGIC; -+ tsk->ino = data; -+} -+ -+#if 0 -+static inline void vzquota_cur_qmblk_reset(void) -+{ -+ current->magic = 0; -+} -+#endif -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Superblock quota operations -+ * -+ * --------------------------------------------------------------------- */ -+ -+/* -+ * Kernel structure abuse. -+ * We use files[0] pointer as an int variable: -+ * reference counter of how many quota blocks uses this superblock. -+ * files[1] is used for generations structure which helps us to track -+ * when traversing of dentries is really required. -+ */ -+#define __VZ_QUOTA_NOQUOTA(sb) sb->s_dquot.vzdq_master -+#define __VZ_QUOTA_TSTAMP(sb) ((struct timeval *)\ -+ &sb->s_dquot.dqio_sem) -+ -+#if defined(VZ_QUOTA_UNLOAD) -+ -+#define __VZ_QUOTA_SBREF(sb) sb->s_dquot.vzdq_count -+ -+struct dquot_operations *orig_dq_op; -+struct quotactl_ops *orig_dq_cop; -+ -+/** -+ * quota_get_super - account for new a quoted tree under the superblock -+ * -+ * One superblock can have multiple directory subtrees with different VZ -+ * quotas. We keep a counter of such subtrees and set VZ quota operations or -+ * reset the default ones. -+ * -+ * Called under vz_quota_sem (from quota_on). 
-+ */
-+int vzquota_get_super(struct super_block *sb)
-+{
-+	if (sb->dq_op != &vz_quota_operations) { /* VZ quota ops not installed on this sb yet */
-+		down(&sb->s_dquot.dqonoff_sem);
-+		if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) {
-+			up(&sb->s_dquot.dqonoff_sem);
-+			return -EEXIST; /* usr/grp quota flags are already set on this sb */
-+		}
-+		if (orig_dq_op == NULL && sb->dq_op != NULL)
-+			orig_dq_op = sb->dq_op; /* remember the first non-NULL dq_op seen */
-+		sb->dq_op = &vz_quota_operations;
-+		if (orig_dq_cop == NULL && sb->s_qcop != NULL)
-+			orig_dq_cop = sb->s_qcop; /* likewise for the quotactl ops */
-+		/* XXX this may race with sys_quotactl */
-+#ifdef CONFIG_VZ_QUOTA_UGID
-+		sb->s_qcop = &vz_quotactl_operations;
-+#else
-+		sb->s_qcop = NULL;
-+#endif
-+		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); /* time stamp of the switch to VZ ops */
-+		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
-+
-+		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
-+		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
-+		sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
-+		sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
-+		/*
-+		 * To get quotaops.h call us we need to mark superblock
-+		 * as having quota.  These flags mark the moment when
-+		 * our dq_op start to be called.
-+		 *
-+		 * The ordering of dq_op and s_dquot.flags assignment
-+		 * needs to be enforced, but other CPUs do not do rmb()
-+		 * between s_dquot.flags and dq_op accesses.
-+		 *
-+		 * Hence the write barrier followed by synchronize_sched()
-+		 * before the flags are published below.
-+		 */
-+		wmb(); synchronize_sched();
-+		sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
-+		__module_get(THIS_MODULE); /* module reference taken once per switched superblock */
-+		up(&sb->s_dquot.dqonoff_sem);
-+	}
-+	/*
-+	 * protected by vz_quota_sem
-+	 * (one count per quota tree; vzquota_put_super() decrements it)
-+	 */
-+	__VZ_QUOTA_SBREF(sb)++;
-+	return 0;
-+}
-+
-+/**
-+ * quota_put_super - release superblock when one quota tree goes away
-+ *
-+ * Called under vz_quota_sem.
-+ */ -+void vzquota_put_super(struct super_block *sb) -+{ -+ int count; -+ -+ count = --__VZ_QUOTA_SBREF(sb); -+ if (count == 0) { -+ down(&sb->s_dquot.dqonoff_sem); -+ sb->s_dquot.flags = 0; -+ wmb(); synchronize_sched(); -+ sema_init(&sb->s_dquot.dqio_sem, 1); -+ sb->s_qcop = orig_dq_cop; -+ sb->dq_op = orig_dq_op; -+ inode_qmblk_lock(sb); -+ quota_gen_put(SB_QGEN(sb)); -+ SB_QGEN(sb) = NULL; -+ /* release qlnk's without qmblk */ -+ remove_inode_quota_links_list(&non_vzquota_inodes_lh, -+ sb, NULL); -+ /* -+ * Races with quota initialization: -+ * after this inode_qmblk_unlock all inode's generations are -+ * invalidated, quota_inode_qmblk checks superblock operations. -+ */ -+ inode_qmblk_unlock(sb); -+ /* -+ * Module refcounting: in theory, this is the best place -+ * to call module_put(THIS_MODULE). -+ * In reality, it can't be done because we can't be sure that -+ * other CPUs do not enter our code segment through dq_op -+ * cached long time ago. Quotaops interface isn't supposed to -+ * go into modules currently (that is, into unloadable -+ * modules). By omitting module_put, our module isn't -+ * unloadable. -+ */ -+ up(&sb->s_dquot.dqonoff_sem); -+ } -+} -+ -+#else -+ -+struct vzquota_new_sop { -+ struct super_operations new_op; -+ struct super_operations *old_op; -+}; -+ -+/** -+ * vzquota_shutdown_super - callback on umount -+ */ -+void vzquota_shutdown_super(struct super_block *sb) -+{ -+ struct vz_quota_master *qmblk; -+ struct vzquota_new_sop *sop; -+ -+ qmblk = __VZ_QUOTA_NOQUOTA(sb); -+ __VZ_QUOTA_NOQUOTA(sb) = NULL; -+ if (qmblk != NULL) -+ qmblk_put(qmblk); -+ sop = container_of(sb->s_op, struct vzquota_new_sop, new_op); -+ sb->s_op = sop->old_op; -+ kfree(sop); -+ (*sb->s_op->put_super)(sb); -+} -+ -+/** -+ * vzquota_get_super - account for new a quoted tree under the superblock -+ * -+ * One superblock can have multiple directory subtrees with different VZ -+ * quotas. -+ * -+ * Called under vz_quota_sem (from vzquota_on). 
-+ */ -+int vzquota_get_super(struct super_block *sb) -+{ -+ struct vz_quota_master *qnew; -+ struct vzquota_new_sop *sop; -+ int err; -+ -+ down(&sb->s_dquot.dqonoff_sem); -+ err = -EEXIST; -+ if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) && -+ sb->dq_op != &vz_quota_operations) -+ goto out_up; -+ -+ /* -+ * This allocation code should be under sb->dq_op check below, but -+ * it doesn't really matter... -+ */ -+ if (__VZ_QUOTA_NOQUOTA(sb) == NULL) { -+ qnew = vzquota_alloc_fake(); -+ if (qnew == NULL) -+ goto out_up; -+ __VZ_QUOTA_NOQUOTA(sb) = qnew; -+ } -+ -+ if (sb->dq_op != &vz_quota_operations) { -+ sop = kmalloc(sizeof(*sop), GFP_KERNEL); -+ if (sop == NULL) { -+ vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb)); -+ __VZ_QUOTA_NOQUOTA(sb) = NULL; -+ goto out_up; -+ } -+ memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op)); -+ sop->new_op.put_super = &vzquota_shutdown_super; -+ sop->old_op = sb->s_op; -+ sb->s_op = &sop->new_op; -+ -+ sb->dq_op = &vz_quota_operations; -+#ifdef CONFIG_VZ_QUOTA_UGID -+ sb->s_qcop = &vz_quotactl_operations; -+#else -+ sb->s_qcop = NULL; -+#endif -+ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); -+ -+ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); -+ /* these 2 list heads are checked in sync_dquots() */ -+ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); -+ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); -+ sb->s_dquot.info[USRQUOTA].dqi_format = -+ &vz_quota_empty_v2_format; -+ sb->s_dquot.info[GRPQUOTA].dqi_format = -+ &vz_quota_empty_v2_format; -+ -+ /* -+ * To get quotaops.h to call us we need to mark superblock -+ * as having quota. These flags mark the moment when -+ * our dq_op start to be called. -+ * -+ * The ordering of dq_op and s_dquot.flags assignment -+ * needs to be enforced, but other CPUs do not do rmb() -+ * between s_dquot.flags and dq_op accesses. 
-+ */ -+ wmb(); synchronize_sched(); -+ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; -+ } -+ err = 0; -+ -+out_up: -+ up(&sb->s_dquot.dqonoff_sem); -+ return err; -+} -+ -+/** -+ * vzquota_put_super - one quota tree less on this superblock -+ * -+ * Called under vz_quota_sem. -+ */ -+void vzquota_put_super(struct super_block *sb) -+{ -+ /* -+ * Even if this put is the last one, -+ * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop -+ * won't be called and the remaining qmblk references won't be put. -+ */ -+} -+ -+#endif -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Helpers for inode -> qmblk link maintenance -+ * -+ * --------------------------------------------------------------------- */ -+ -+#define __VZ_QUOTA_EMPTY ((void *)0xbdbdbdbd) -+#define VZ_QUOTA_IS_NOQUOTA(qm, sb) ((qm)->dq_flags & VZDQ_NOQUOT) -+#define VZ_QUOTA_EMPTY_IOPS (&vfs_empty_iops) -+extern struct inode_operations vfs_empty_iops; -+ -+static int VZ_QUOTA_IS_ACTUAL(struct inode *inode) -+{ -+ struct vz_quota_master *qmblk; -+ -+ qmblk = INODE_QLNK(inode)->qmblk; -+ if (qmblk == VZ_QUOTA_BAD) -+ return 1; -+ if (qmblk == __VZ_QUOTA_EMPTY) -+ return 0; -+ if (qmblk->dq_flags & VZDQ_NOACT) -+ /* not actual (invalidated) qmblk */ -+ return 0; -+ return 1; -+} -+ -+static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk) -+{ -+ return qlnk->qmblk == __VZ_QUOTA_EMPTY; -+} -+ -+static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk) -+{ -+ qlnk->qmblk = __VZ_QUOTA_EMPTY; -+ qlnk->origin = VZ_QUOTAO_SETE; -+} -+ -+void vzquota_qlnk_init(struct vz_quota_ilink *qlnk) -+{ -+ memset(qlnk, 0, sizeof(*qlnk)); -+ INIT_LIST_HEAD(&qlnk->list); -+ vzquota_qlnk_set_empty(qlnk); -+ qlnk->origin = VZ_QUOTAO_INIT; -+} -+ -+void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk) -+{ -+ might_sleep(); -+ if (vzquota_qlnk_is_empty(qlnk)) -+ return; -+#if defined(CONFIG_VZ_QUOTA_UGID) -+ if (qlnk->qmblk != NULL && 
qlnk->qmblk != VZ_QUOTA_BAD) { -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ugid *quid, *qgid; -+ qmblk = qlnk->qmblk; -+ quid = qlnk->qugid[USRQUOTA]; -+ qgid = qlnk->qugid[GRPQUOTA]; -+ if (quid != NULL || qgid != NULL) { -+ down(&qmblk->dq_sem); -+ if (qgid != NULL) -+ vzquota_put_ugid(qmblk, qgid); -+ if (quid != NULL) -+ vzquota_put_ugid(qmblk, quid); -+ up(&qmblk->dq_sem); -+ } -+ } -+#endif -+ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) -+ qmblk_put(qlnk->qmblk); -+ qlnk->origin = VZ_QUOTAO_DESTR; -+} -+ -+/** -+ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents -+ * @qlt: temporary -+ * @qli: inode's -+ * -+ * Locking is provided by the caller (depending on the context). -+ * After swap, @qli is inserted into the corresponding dq_ilink_list, -+ * @qlt list is reinitialized. -+ */ -+static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt, -+ struct vz_quota_ilink *qli) -+{ -+ struct vz_quota_master *qb; -+ struct vz_quota_ugid *qu; -+ int i; -+ -+ qb = qlt->qmblk; -+ qlt->qmblk = qli->qmblk; -+ qli->qmblk = qb; -+ list_del_init(&qli->list); -+ if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD) -+ list_add(&qli->list, &qb->dq_ilink_list); -+ INIT_LIST_HEAD(&qlt->list); -+ qli->origin = VZ_QUOTAO_SWAP; -+ -+ for (i = 0; i < MAXQUOTAS; i++) { -+ qu = qlt->qugid[i]; -+ qlt->qugid[i] = qli->qugid[i]; -+ qli->qugid[i] = qu; -+ } -+} -+ -+/** -+ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks -+ * -+ * Called under dcache_lock and inode_qmblk locks. -+ * Returns 1 if locks were dropped inside, 0 if atomic. 
-+ */
-+static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk,
-+		struct inode *inode)
-+{
-+	if (vzquota_qlnk_is_empty(qlnk))
-+		return 0; /* nothing to destroy */
-+	if (qlnk->qmblk == VZ_QUOTA_BAD) {
-+		vzquota_qlnk_set_empty(qlnk); /* reset in place, locks stay held */
-+		return 0;
-+	}
-+	spin_unlock(&dcache_lock); /* vzquota_qlnk_destroy() calls might_sleep(): drop both locks */
-+	inode_qmblk_unlock(inode->i_sb);
-+	vzquota_qlnk_destroy(qlnk);
-+	vzquota_qlnk_init(qlnk); /* leave the link empty and reusable */
-+	inode_qmblk_lock(inode->i_sb); /* retake in the reverse order of release */
-+	spin_lock(&dcache_lock);
-+	return 1;
-+}
-+
-+#if defined(CONFIG_VZ_QUOTA_UGID)
-+/**
-+ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content
-+ * @qlnk: link to reset
-+ * @inode: inode whose superblock's inode_qmblk lock is held
-+ * @qmblk: master block whose data write lock is held
-+ *
-+ * Similar to vzquota_qlnk_reinit_locked, called under different locks:
-+ * the qmblk data write lock and the inode_qmblk lock, both of which are
-+ * dropped around the destroy and retaken before returning.
-+ * Returns 1 if locks were dropped inside, 0 if @qlnk was already empty.
-+ */
-+static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk,
-+		struct inode *inode,
-+		struct vz_quota_master *qmblk)
-+{
-+	if (vzquota_qlnk_is_empty(qlnk))
-+		return 0;
-+	/* may be optimized if qlnk->qugid all NULLs */
-+	qmblk_data_write_unlock(qmblk);
-+	inode_qmblk_unlock(inode->i_sb);
-+	vzquota_qlnk_destroy(qlnk);
-+	vzquota_qlnk_init(qlnk);
-+	inode_qmblk_lock(inode->i_sb);
-+	qmblk_data_write_lock(qmblk);
-+	return 1;
-+}
-+#endif
-+
-+/**
-+ * vzquota_qlnk_fill - fill vz_quota_ilink content
-+ * @qlnk: vz_quota_ilink to fill
-+ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid)
-+ * @qmblk: qmblk to which this @qlnk will belong
-+ *
-+ * Called under dcache_lock and inode_qmblk locks.
-+ * Returns 1 if locks were dropped inside, 0 if atomic.
-+ * @qlnk is expected to be empty.
-+ */
-+static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk,
-+		struct inode *inode,
-+		struct vz_quota_master *qmblk)
-+{
-+	if (qmblk != VZ_QUOTA_BAD)
-+		qmblk_get(qmblk); /* the link holds its own reference on a real qmblk */
-+	qlnk->qmblk = qmblk;
-+
-+#if defined(CONFIG_VZ_QUOTA_UGID)
-+	if (qmblk != VZ_QUOTA_BAD &&
-+	    !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
-+	    (qmblk->dq_flags & VZDQUG_ON)) { /* ugid quota is on for this qmblk */
-+		struct vz_quota_ugid *quid, *qgid;
-+
-+		spin_unlock(&dcache_lock); /* dq_sem is taken with down(): drop both locks first */
-+		inode_qmblk_unlock(inode->i_sb);
-+
-+		down(&qmblk->dq_sem);
-+		quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0);
-+		qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0);
-+		up(&qmblk->dq_sem);
-+
-+		inode_qmblk_lock(inode->i_sb);
-+		spin_lock(&dcache_lock);
-+		qlnk->qugid[USRQUOTA] = quid;
-+		qlnk->qugid[GRPQUOTA] = qgid;
-+		return 1; /* locks were dropped: caller may need to recheck */
-+	}
-+#endif
-+
-+	return 0;
-+}
-+
-+#if defined(CONFIG_VZ_QUOTA_UGID)
-+/**
-+ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid
-+ *
-+ * This function is a helper for vzquota_transfer, and differs from
-+ * vzquota_qlnk_fill only by locking.
-+ */
-+static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk,
-+		struct inode *inode,
-+		struct iattr *iattr,
-+		int mask,
-+		struct vz_quota_master *qmblk)
-+{
-+	qmblk_get(qmblk); /* reference owned by @qlnk from now on */
-+	qlnk->qmblk = qmblk;
-+
-+	if (mask) { /* bit (1 << USRQUOTA) and/or (1 << GRPQUOTA): ids that change */
-+		struct vz_quota_ugid *quid, *qgid;
-+
-+		quid = qgid = NULL; /* to make gcc happy */
-+		if (!(mask & (1 << USRQUOTA))) /* uid unchanged: reuse the inode's current entry */
-+			quid = vzquota_get_ugid(INODE_QLNK(inode)->
-+					qugid[USRQUOTA]);
-+		if (!(mask & (1 << GRPQUOTA))) /* gid unchanged: reuse the inode's current entry */
-+			qgid = vzquota_get_ugid(INODE_QLNK(inode)->
-+					qugid[GRPQUOTA]);
-+
-+		qmblk_data_write_unlock(qmblk); /* dq_sem is taken with down(): drop both locks first */
-+		inode_qmblk_unlock(inode->i_sb);
-+
-+		down(&qmblk->dq_sem);
-+		if (mask & (1 << USRQUOTA)) /* look up the entry for the new owner */
-+			quid = __vzquota_find_ugid(qmblk, iattr->ia_uid,
-+					USRQUOTA, 0);
-+		if (mask & (1 << GRPQUOTA)) /* look up the entry for the new group */
-+			qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid,
-+					GRPQUOTA, 0);
-+		up(&qmblk->dq_sem);
-+
-+		inode_qmblk_lock(inode->i_sb);
-+		qmblk_data_write_lock(qmblk);
-+		qlnk->qugid[USRQUOTA] = quid;
-+		qlnk->qugid[GRPQUOTA] = qgid;
-+		return 1; /* locks were dropped inside */
-+	}
-+
-+	return 0;
-+}
-+#endif
-+
-+/**
-+ * __vzquota_inode_init - make sure inode's qlnk is initialized
-+ * @inode: inode to prepare
-+ * @origin: value recorded in the qlnk origin field
-+ *
-+ * May be called if qlnk is already initialized, detects this situation itself:
-+ * i_dquot[USRQUOTA] == NODQUOT means "not initialized yet"; after
-+ * initialization the slot is set to ~NODQUOT as a marker.
-+ * The origin field is overwritten on every call.
-+ * Called under inode_qmblk_lock.
-+ */
-+static void __vzquota_inode_init(struct inode *inode, unsigned char origin)
-+{
-+	if (inode->i_dquot[USRQUOTA] == NODQUOT) {
-+		vzquota_qlnk_init(INODE_QLNK(inode));
-+		inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT;
-+	}
-+	INODE_QLNK(inode)->origin = origin;
-+}
-+
-+/**
-+ * vzquota_inode_drop - destroy VZ quota information in the inode
-+ *
-+ * Inode must not be externally accessible or dirty.
-+ */ -+static void vzquota_inode_drop(struct inode *inode) -+{ -+ struct vz_quota_ilink qlnk; -+ -+ vzquota_qlnk_init(&qlnk); -+ inode_qmblk_lock(inode->i_sb); -+ vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode)); -+ INODE_QLNK(inode)->origin = VZ_QUOTAO_DRCAL; -+ inode->i_dquot[USRQUOTA] = NODQUOT; -+ inode_qmblk_unlock(inode->i_sb); -+ vzquota_qlnk_destroy(&qlnk); -+} -+ -+/** -+ * vzquota_inode_qmblk_set - initialize inode's qlnk -+ * @inode: inode to be initialized -+ * @qmblk: quota master block to which this inode should belong (may be BAD) -+ * @qlnk: placeholder to store data to resolve locking issues -+ * -+ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise. -+ * Called under dcache_lock and inode_qmblk locks. -+ * @qlnk will be destroyed in the caller chain. -+ * -+ * It is not mandatory to restart parent checks since quota on/off currently -+ * shrinks dentry tree and checks that there are not outside references. -+ * But if at some time that shink is removed, restarts will be required. -+ * Additionally, the restarts prevent inconsistencies if the dentry tree -+ * changes (inode is moved). This is not a big deal, but anyway... 
-+ */ -+static int vzquota_inode_qmblk_set(struct inode *inode, -+ struct vz_quota_master *qmblk, -+ struct vz_quota_ilink *qlnk) -+{ -+ if (qmblk == NULL) { -+ printk(KERN_ERR "VZDQ: NULL in set, " -+ "orig %u, dev %s, inode %lu, fs %s\n", -+ INODE_QLNK(inode)->origin, -+ inode->i_sb->s_id, inode->i_ino, -+ inode->i_sb->s_type->name); -+ printk(KERN_ERR "current %d (%s), VE %d\n", -+ current->pid, current->comm, -+ VEID(get_exec_env())); -+ dump_stack(); -+ qmblk = VZ_QUOTA_BAD; -+ } -+ while (1) { -+ if (vzquota_qlnk_is_empty(qlnk) && -+ vzquota_qlnk_fill(qlnk, inode, qmblk)) -+ return 1; -+ if (qlnk->qmblk == qmblk) -+ break; -+ if (vzquota_qlnk_reinit_locked(qlnk, inode)) -+ return 1; -+ } -+ vzquota_qlnk_swap(qlnk, INODE_QLNK(inode)); -+ INODE_QLNK(inode)->origin = VZ_QUOTAO_QSET; -+ return 0; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * vzquota_inode_qmblk (inode -> qmblk lookup) parts -+ * -+ * --------------------------------------------------------------------- */ -+ -+static int vzquota_dparents_check_attach(struct inode *inode) -+{ -+ if (!list_empty(&inode->i_dentry)) -+ return 0; -+ printk(KERN_ERR "VZDQ: no parent for " -+ "dev %s, inode %lu, fs %s\n", -+ inode->i_sb->s_id, -+ inode->i_ino, -+ inode->i_sb->s_type->name); -+ return -1; -+} -+ -+static struct inode *vzquota_dparents_check_actual(struct inode *inode) -+{ -+ struct dentry *de; -+ -+ list_for_each_entry(de, &inode->i_dentry, d_alias) { -+ if (de->d_parent == de) /* detached dentry, perhaps */ -+ continue; -+ /* first access to parent, make sure its qlnk initialized */ -+ __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT); -+ if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode)) -+ return de->d_parent->d_inode; -+ } -+ return NULL; -+} -+ -+static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode) -+{ -+ struct dentry *de; -+ struct vz_quota_master *qmblk; -+ -+ qmblk = NULL; -+ list_for_each_entry(de, 
&inode->i_dentry, d_alias) { -+ if (de->d_parent == de) /* detached dentry, perhaps */ -+ continue; -+ if (qmblk == NULL) { -+ qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk; -+ continue; -+ } -+ if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) { -+ printk(KERN_WARNING "VZDQ: multiple quotas for " -+ "dev %s, inode %lu, fs %s\n", -+ inode->i_sb->s_id, -+ inode->i_ino, -+ inode->i_sb->s_type->name); -+ qmblk = VZ_QUOTA_BAD; -+ break; -+ } -+ } -+ if (qmblk == NULL) { -+ printk(KERN_WARNING "VZDQ: not attached to tree, " -+ "dev %s, inode %lu, fs %s\n", -+ inode->i_sb->s_id, -+ inode->i_ino, -+ inode->i_sb->s_type->name); -+ qmblk = VZ_QUOTA_BAD; -+ } -+ return qmblk; -+} -+ -+static void vzquota_dbranch_actualize(struct inode *inode, -+ struct inode *refinode) -+{ -+ struct inode *pinode; -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ilink qlnk; -+ -+ vzquota_qlnk_init(&qlnk); -+ -+start: -+ if (inode == inode->i_sb->s_root->d_inode) { -+ /* filesystem root */ -+ atomic_inc(&inode->i_count); -+ do { -+ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); -+ } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk)); -+ goto out; -+ } -+ -+ if (!vzquota_dparents_check_attach(inode)) { -+ pinode = vzquota_dparents_check_actual(inode); -+ if (pinode != NULL) { -+ inode = pinode; -+ goto start; -+ } -+ } -+ -+ atomic_inc(&inode->i_count); -+ while (1) { -+ if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */ -+ break; -+ /* -+ * Need to check parents again if we have slept inside -+ * vzquota_inode_qmblk_set() in the loop. -+ * If the state of parents is different, just return and repeat -+ * the actualizing process again from the inode passed to -+ * vzquota_inode_qmblk_recalc(). 
-+ */ -+ if (!vzquota_dparents_check_attach(inode)) { -+ if (vzquota_dparents_check_actual(inode) != NULL) -+ break; -+ qmblk = vzquota_dparents_check_same(inode); -+ } else -+ qmblk = VZ_QUOTA_BAD; -+ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */ -+ INODE_QLNK(inode)->origin = VZ_QUOTAO_ACT; -+ break; -+ } -+ } -+ -+out: -+ spin_unlock(&dcache_lock); -+ inode_qmblk_unlock(refinode->i_sb); -+ vzquota_qlnk_destroy(&qlnk); -+ iput(inode); -+ inode_qmblk_lock(refinode->i_sb); -+ spin_lock(&dcache_lock); -+} -+ -+static void vzquota_dtree_qmblk_recalc(struct inode *inode, -+ struct vz_quota_ilink *qlnk) -+{ -+ struct inode *pinode; -+ struct vz_quota_master *qmblk; -+ -+ if (inode == inode->i_sb->s_root->d_inode) { -+ /* filesystem root */ -+ do { -+ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); -+ } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk)); -+ return; -+ } -+ -+start: -+ if (VZ_QUOTA_IS_ACTUAL(inode)) -+ return; -+ /* -+ * Here qmblk is (re-)initialized for all ancestors. -+ * This is not a very efficient procedure, but it guarantees that -+ * the quota tree is consistent (that is, the inode doesn't have two -+ * ancestors with different qmblk). -+ */ -+ if (!vzquota_dparents_check_attach(inode)) { -+ pinode = vzquota_dparents_check_actual(inode); -+ if (pinode != NULL) { -+ vzquota_dbranch_actualize(pinode, inode); -+ goto start; -+ } -+ qmblk = vzquota_dparents_check_same(inode); -+ } else -+ qmblk = VZ_QUOTA_BAD; -+ -+ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) -+ goto start; -+ INODE_QLNK(inode)->origin = VZ_QUOTAO_DTREE; -+} -+ -+static void vzquota_det_qmblk_recalc(struct inode *inode, -+ struct vz_quota_ilink *qlnk) -+{ -+ struct inode *parent; -+ struct vz_quota_master *qmblk; -+ char *msg; -+ int cnt; -+ time_t timeout; -+ -+ cnt = 0; -+ parent = NULL; -+start: -+ /* -+ * qmblk of detached inodes shouldn't be considered as not actual. -+ * They are not in any dentry tree, so quota on/off shouldn't affect -+ * them. 
-+ */ -+ if (!vzquota_qlnk_is_empty(INODE_QLNK(inode))) -+ return; -+ -+ timeout = 3; -+ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); -+ msg = "detached inode not in creation"; -+ if (inode->i_op != VZ_QUOTA_EMPTY_IOPS) -+ goto fail; -+ qmblk = VZ_QUOTA_BAD; -+ msg = "unexpected creation context"; -+ if (!vzquota_cur_qmblk_check()) -+ goto fail; -+ timeout = 0; -+ parent = vzquota_cur_qmblk_fetch(); -+ msg = "uninitialized parent"; -+ if (vzquota_qlnk_is_empty(INODE_QLNK(parent))) -+ goto fail; -+ msg = "parent not in tree"; -+ if (list_empty(&parent->i_dentry)) -+ goto fail; -+ msg = "parent has 0 refcount"; -+ if (!atomic_read(&parent->i_count)) -+ goto fail; -+ msg = "parent has different sb"; -+ if (parent->i_sb != inode->i_sb) -+ goto fail; -+ if (!VZ_QUOTA_IS_ACTUAL(parent)) { -+ vzquota_dbranch_actualize(parent, inode); -+ goto start; -+ } -+ -+ qmblk = INODE_QLNK(parent)->qmblk; -+set: -+ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) -+ goto start; -+ INODE_QLNK(inode)->origin = VZ_QUOTAO_DET; -+ return; -+ -+fail: -+ { -+ struct timeval tv, tvo; -+ do_gettimeofday(&tv); -+ memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo)); -+ tv.tv_sec -= tvo.tv_sec; -+ if (tv.tv_usec < tvo.tv_usec) { -+ tv.tv_sec--; -+ tv.tv_usec += USEC_PER_SEC - tvo.tv_usec; -+ } else -+ tv.tv_usec -= tvo.tv_usec; -+ if (tv.tv_sec < timeout) -+ goto set; -+ printk(KERN_ERR "VZDQ: %s, orig %u," -+ " dev %s, inode %lu, fs %s\n", -+ msg, INODE_QLNK(inode)->origin, -+ inode->i_sb->s_id, inode->i_ino, -+ inode->i_sb->s_type->name); -+ if (!cnt++) { -+ printk(KERN_ERR "current %d (%s), VE %d," -+ " time %ld.%06ld\n", -+ current->pid, current->comm, -+ VEID(get_exec_env()), -+ tv.tv_sec, tv.tv_usec); -+ dump_stack(); -+ } -+ if (parent != NULL) -+ printk(KERN_ERR "VZDQ: parent of %lu is %lu\n", -+ inode->i_ino, parent->i_ino); -+ } -+ goto set; -+} -+ -+static void vzquota_inode_qmblk_recalc(struct inode *inode, -+ struct vz_quota_ilink *qlnk) -+{ -+ spin_lock(&dcache_lock); -+ if 
(!list_empty(&inode->i_dentry)) -+ vzquota_dtree_qmblk_recalc(inode, qlnk); -+ else -+ vzquota_det_qmblk_recalc(inode, qlnk); -+ spin_unlock(&dcache_lock); -+} -+ -+/** -+ * vzquota_inode_qmblk - obtain inode's qmblk -+ * -+ * Returns qmblk with refcounter taken, %NULL if not under -+ * VZ quota or %VZ_QUOTA_BAD. -+ * -+ * FIXME: This function should be removed when vzquota_find_qmblk / -+ * get_quota_root / vzquota_dstat code is cleaned up. -+ */ -+struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ilink qlnk; -+ -+ might_sleep(); -+ -+ if (inode->i_sb->dq_op != &vz_quota_operations) -+ return NULL; -+#if defined(VZ_QUOTA_UNLOAD) -+#error Make sure qmblk does not disappear -+#endif -+ -+ vzquota_qlnk_init(&qlnk); -+ inode_qmblk_lock(inode->i_sb); -+ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); -+ -+ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || -+ !VZ_QUOTA_IS_ACTUAL(inode)) -+ vzquota_inode_qmblk_recalc(inode, &qlnk); -+ -+ qmblk = INODE_QLNK(inode)->qmblk; -+ if (qmblk != VZ_QUOTA_BAD) { -+ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) -+ qmblk_get(qmblk); -+ else -+ qmblk = NULL; -+ } -+ -+ inode_qmblk_unlock(inode->i_sb); -+ vzquota_qlnk_destroy(&qlnk); -+ return qmblk; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Calls from quota operations -+ * -+ * --------------------------------------------------------------------- */ -+ -+/** -+ * vzquota_inode_init_call - call from DQUOT_INIT -+ */ -+void vzquota_inode_init_call(struct inode *inode) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_datast data; -+ -+ /* initializes inode's quota inside */ -+ qmblk = vzquota_inode_data(inode, &data); -+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) -+ vzquota_data_unlock(inode, &data); -+ -+ /* -+ * The check is needed for repeated new_inode() calls from a single -+ * ext3 call like create or mkdir in case of -ENOSPC. 
-+ */ -+ spin_lock(&dcache_lock); -+ if (!list_empty(&inode->i_dentry)) -+ vzquota_cur_qmblk_set(inode); -+ spin_unlock(&dcache_lock); -+} -+ -+/** -+ * vzquota_inode_drop_call - call from DQUOT_DROP -+ */ -+void vzquota_inode_drop_call(struct inode *inode) -+{ -+ vzquota_inode_drop(inode); -+} -+ -+/** -+ * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs -+ * @inode: the inode -+ * @data: storage space -+ * -+ * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk. -+ * On return if qmblk is neither NULL nor VZ_QUOTA_BAD: -+ * qmblk in inode's qlnk is the same as returned, -+ * ugid pointers inside inode's qlnk are valid, -+ * some locks are taken (and should be released by vzquota_data_unlock). -+ * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken. -+ */ -+struct vz_quota_master *vzquota_inode_data(struct inode *inode, -+ struct vz_quota_datast *data) -+{ -+ struct vz_quota_master *qmblk; -+ -+ might_sleep(); -+ -+ vzquota_qlnk_init(&data->qlnk); -+ inode_qmblk_lock(inode->i_sb); -+ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); -+ -+ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || -+ !VZ_QUOTA_IS_ACTUAL(inode)) -+ vzquota_inode_qmblk_recalc(inode, &data->qlnk); -+ -+ qmblk = INODE_QLNK(inode)->qmblk; -+ if (qmblk != VZ_QUOTA_BAD) { -+ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) { -+ /* -+ * Note that in the current implementation, -+ * inode_qmblk_lock can theoretically be dropped here. -+ * This place is serialized with quota_off because -+ * quota_off fails when there are extra dentry -+ * references and syncs inodes before removing quota -+ * information from them. -+ * However, quota usage information should stop being -+ * updated immediately after vzquota_off. 
-+ */ -+ qmblk_data_write_lock(qmblk); -+ } else { -+ inode_qmblk_unlock(inode->i_sb); -+ qmblk = NULL; -+ } -+ } else { -+ inode_qmblk_unlock(inode->i_sb); -+ } -+ return qmblk; -+} -+ -+void vzquota_data_unlock(struct inode *inode, -+ struct vz_quota_datast *data) -+{ -+ qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk); -+ inode_qmblk_unlock(inode->i_sb); -+ vzquota_qlnk_destroy(&data->qlnk); -+} -+ -+#if defined(CONFIG_VZ_QUOTA_UGID) -+/** -+ * vzquota_inode_transfer_call - call from vzquota_transfer -+ */ -+int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_datast data; -+ struct vz_quota_ilink qlnew; -+ int mask; -+ int ret; -+ -+ might_sleep(); -+ vzquota_qlnk_init(&qlnew); -+start: -+ qmblk = vzquota_inode_data(inode, &data); -+ ret = NO_QUOTA; -+ if (qmblk == VZ_QUOTA_BAD) -+ goto out_destr; -+ ret = QUOTA_OK; -+ if (qmblk == NULL) -+ goto out_destr; -+ qmblk_get(qmblk); -+ -+ ret = QUOTA_OK; -+ if (!(qmblk->dq_flags & VZDQUG_ON)) -+ /* no ugid quotas */ -+ goto out_unlock; -+ -+ mask = 0; -+ if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid) -+ mask |= 1 << USRQUOTA; -+ if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid) -+ mask |= 1 << GRPQUOTA; -+ while (1) { -+ if (vzquota_qlnk_is_empty(&qlnew) && -+ vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk)) -+ break; -+ if (qlnew.qmblk == INODE_QLNK(inode)->qmblk && -+ qlnew.qmblk == qmblk) -+ goto finish; -+ if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk)) -+ break; -+ } -+ -+ /* prepare for restart */ -+ vzquota_data_unlock(inode, &data); -+ qmblk_put(qmblk); -+ goto start; -+ -+finish: -+ /* all references obtained successfully */ -+ ret = vzquota_transfer_usage(inode, mask, &qlnew); -+ if (!ret) { -+ vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode)); -+ INODE_QLNK(inode)->origin = VZ_QUOTAO_TRANS; -+ } -+out_unlock: -+ vzquota_data_unlock(inode, &data); -+ qmblk_put(qmblk); -+out_destr: -+ 
vzquota_qlnk_destroy(&qlnew); -+ return ret; -+} -+#endif -+ -+int vzquota_rename_check(struct inode *inode, -+ struct inode *old_dir, struct inode *new_dir) -+{ -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ilink qlnk1, qlnk2; -+ int c, ret; -+ -+ if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb) -+ return -1; -+ -+ might_sleep(); -+ -+ vzquota_qlnk_init(&qlnk1); -+ vzquota_qlnk_init(&qlnk2); -+ inode_qmblk_lock(inode->i_sb); -+ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); -+ __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL); -+ __vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL); -+ -+ do { -+ c = 0; -+ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || -+ !VZ_QUOTA_IS_ACTUAL(inode)) { -+ vzquota_inode_qmblk_recalc(inode, &qlnk1); -+ c++; -+ } -+ if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) || -+ !VZ_QUOTA_IS_ACTUAL(new_dir)) { -+ vzquota_inode_qmblk_recalc(new_dir, &qlnk2); -+ c++; -+ } -+ } while (c); -+ -+ ret = 0; -+ qmblk = INODE_QLNK(inode)->qmblk; -+ if (qmblk != INODE_QLNK(new_dir)->qmblk) { -+ ret = -1; -+ if (qmblk != VZ_QUOTA_BAD && -+ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && -+ qmblk->dq_root_dentry->d_inode == inode && -+ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk, -+ inode->i_sb) && -+ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk, -+ inode->i_sb)) -+ /* quota root rename is allowed */ -+ ret = 0; -+ } -+ -+ inode_qmblk_unlock(inode->i_sb); -+ vzquota_qlnk_destroy(&qlnk2); -+ vzquota_qlnk_destroy(&qlnk1); -+ return ret; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * qmblk-related parts of on/off operations -+ * -+ * --------------------------------------------------------------------- */ -+ -+/** -+ * vzquota_check_dtree - check dentry tree if quota on/off is allowed -+ * -+ * This function doesn't allow quota to be turned on/off if some dentries in -+ * the tree have external references. 
-+ * In addition to technical reasons, it enforces user-space correctness: -+ * current usage (taken from or reported to the user space) can be meaningful -+ * and accurate only if the tree is not being modified. -+ * Side effect: additional vfsmount structures referencing the tree (bind -+ * mounts of tree nodes to some other places) are not allowed at on/off time. -+ */ -+int vzquota_check_dtree(struct vz_quota_master *qmblk, int off) -+{ -+ struct dentry *dentry; -+ int err, count; -+ -+ err = -EBUSY; -+ dentry = qmblk->dq_root_dentry; -+ -+ if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root) -+ goto unhashed; -+ -+ /* attempt to shrink */ -+ if (!list_empty(&dentry->d_subdirs)) { -+ spin_unlock(&dcache_lock); -+ inode_qmblk_unlock(dentry->d_sb); -+ shrink_dcache_parent(dentry); -+ inode_qmblk_lock(dentry->d_sb); -+ spin_lock(&dcache_lock); -+ if (!list_empty(&dentry->d_subdirs)) -+ goto out; -+ -+ count = 1; -+ if (dentry == dentry->d_sb->s_root) -+ count += 2; /* sb and mnt refs */ -+ if (atomic_read(&dentry->d_count) < count) { -+ printk(KERN_ERR "%s: too small count %d vs %d.\n", -+ __FUNCTION__, -+ atomic_read(&dentry->d_count), count); -+ goto out; -+ } -+ if (atomic_read(&dentry->d_count) > count) -+ goto out; -+ } -+ -+ err = 0; -+out: -+ return err; -+ -+unhashed: -+ /* -+ * Quota root is removed. -+ * Allow to turn quota off, but not on. 
-+ */ -+ if (off) -+ err = 0; -+ goto out; -+} -+ -+int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, -+ struct vz_quota_master *qmblk) -+{ -+ struct vz_quota_ilink qlnk; -+ struct vz_quota_master *qold, *qnew; -+ int err; -+ -+ might_sleep(); -+ -+ qold = NULL; -+ qnew = vzquota_alloc_fake(); -+ if (qnew == NULL) -+ return -ENOMEM; -+ -+ vzquota_qlnk_init(&qlnk); -+ inode_qmblk_lock(sb); -+ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); -+ -+ spin_lock(&dcache_lock); -+ while (1) { -+ err = vzquota_check_dtree(qmblk, 0); -+ if (err) -+ break; -+ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) -+ break; -+ } -+ INODE_QLNK(inode)->origin = VZ_QUOTAO_ON; -+ spin_unlock(&dcache_lock); -+ -+ if (!err) { -+ qold = __VZ_QUOTA_NOQUOTA(sb); -+ qold->dq_flags |= VZDQ_NOACT; -+ __VZ_QUOTA_NOQUOTA(sb) = qnew; -+ } -+ -+ inode_qmblk_unlock(sb); -+ vzquota_qlnk_destroy(&qlnk); -+ if (qold != NULL) -+ qmblk_put(qold); -+ -+ return err; -+} -+ -+int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk) -+{ -+ int ret; -+ -+ ret = 0; -+ inode_qmblk_lock(sb); -+ -+ spin_lock(&dcache_lock); -+ if (vzquota_check_dtree(qmblk, 1)) -+ ret = -EBUSY; -+ spin_unlock(&dcache_lock); -+ -+ if (!ret) -+ qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT; -+ inode_qmblk_unlock(sb); -+ return ret; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * External interfaces -+ * -+ * ---------------------------------------------------------------------*/ -+ -+static int vzquota_ioctl(struct inode *ino, struct file *file, -+ unsigned int cmd, unsigned long arg) -+{ -+ int err; -+ struct vzctl_quotactl qb; -+ struct vzctl_quotaugidctl qub; -+ -+ switch (cmd) { -+ case VZCTL_QUOTA_CTL: -+ err = -ENOTTY; -+ break; -+ case VZCTL_QUOTA_NEW_CTL: -+ err = -EFAULT; -+ if (copy_from_user(&qb, (void *)arg, sizeof(qb))) -+ break; -+ err = do_vzquotactl(qb.cmd, qb.quota_id, -+ qb.qstat, qb.ve_root); -+ break; -+#ifdef CONFIG_VZ_QUOTA_UGID 
-+ case VZCTL_QUOTA_UGID_CTL: -+ err = -EFAULT; -+ if (copy_from_user(&qub, (void *)arg, sizeof(qub))) -+ break; -+ err = do_vzquotaugidctl(&qub); -+ break; -+#endif -+ default: -+ err = -ENOTTY; -+ } -+ might_sleep(); /* debug */ -+ return err; -+} -+ -+static struct vzioctlinfo vzdqcalls = { -+ .type = VZDQCTLTYPE, -+ .func = vzquota_ioctl, -+ .owner = THIS_MODULE, -+}; -+ -+/** -+ * vzquota_dstat - get quota usage info for virtual superblock -+ */ -+static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat) -+{ -+ struct vz_quota_master *qmblk; -+ -+ qmblk = vzquota_find_qmblk(super); -+ if (qmblk == NULL) -+ return -ENOENT; -+ if (qmblk == VZ_QUOTA_BAD) { -+ memset(qstat, 0, sizeof(*qstat)); -+ return 0; -+ } -+ -+ qmblk_data_read_lock(qmblk); -+ memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat)); -+ qmblk_data_read_unlock(qmblk); -+ qmblk_put(qmblk); -+ return 0; -+} -+ -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Init/exit helpers -+ * -+ * ---------------------------------------------------------------------*/ -+ -+static int vzquota_cache_init(void) -+{ -+ int i; -+ -+ vzquota_cachep = kmem_cache_create("vz_quota_master", -+ sizeof(struct vz_quota_master), -+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); -+ if (vzquota_cachep == NULL) { -+ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); -+ goto nomem2; -+ } -+ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) -+ INIT_LIST_HEAD(&vzquota_hash_table[i]); -+ -+ return 0; -+ -+nomem2: -+ return -ENOMEM; -+} -+ -+static void vzquota_cache_release(void) -+{ -+ int i; -+ -+ /* sanity check */ -+ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) -+ if (!list_empty(&vzquota_hash_table[i])) -+ BUG(); -+ -+ /* release caches */ -+ if (kmem_cache_destroy(vzquota_cachep)) -+ printk(KERN_ERR -+ "VZQUOTA: vz_quota_master kmem_cache_destroy failed\n"); -+ vzquota_cachep = NULL; -+} -+ -+static int quota_notifier_call(struct vnotifier_block *self, -+ unsigned long n, void *data, int err) 
-+{ -+ struct virt_info_quota *viq; -+ struct super_block *sb; -+ -+ viq = (struct virt_info_quota *)data; -+ switch (n) { -+ case VIRTINFO_QUOTA_ON: -+ err = NOTIFY_BAD; -+ if (!try_module_get(THIS_MODULE)) -+ break; -+ sb = viq->super; -+ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); -+ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); -+ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); -+ err = NOTIFY_OK; -+ break; -+ case VIRTINFO_QUOTA_OFF: -+ module_put(THIS_MODULE); -+ err = NOTIFY_OK; -+ break; -+ case VIRTINFO_QUOTA_GETSTAT: -+ err = NOTIFY_BAD; -+ if (vzquota_dstat(viq->super, viq->qstat)) -+ break; -+ err = NOTIFY_OK; -+ break; -+ } -+ return err; -+} -+ -+struct vnotifier_block quota_notifier_block = { -+ .notifier_call = quota_notifier_call, -+ .priority = INT_MAX, -+}; -+ -+/* ---------------------------------------------------------------------- -+ * -+ * Init/exit procedures -+ * -+ * ---------------------------------------------------------------------*/ -+ -+static int __init vzquota_init(void) -+{ -+ int err; -+ -+ if ((err = vzquota_cache_init()) != 0) -+ goto out_cache; -+ -+ if ((err = vzquota_proc_init()) != 0) -+ goto out_proc; -+ -+#ifdef CONFIG_VZ_QUOTA_UGID -+ if ((err = vzquota_ugid_init()) != 0) -+ goto out_ugid; -+#endif -+ -+ init_MUTEX(&vz_quota_sem); -+ vzioctl_register(&vzdqcalls); -+ virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block); -+#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS) -+ vzaquota_init(); -+#endif -+ -+ return 0; -+ -+#ifdef CONFIG_VZ_QUOTA_UGID -+out_ugid: -+ vzquota_proc_release(); -+#endif -+out_proc: -+ vzquota_cache_release(); -+out_cache: -+ return err; -+} -+ -+#if defined(VZ_QUOTA_UNLOAD) -+static void __exit vzquota_release(void) -+{ -+ virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block); -+ vzioctl_unregister(&vzdqcalls); -+#ifdef CONFIG_VZ_QUOTA_UGID -+#ifdef CONFIG_PROC_FS -+ vzaquota_fini(); -+#endif -+ vzquota_ugid_release(); -+#endif -+ 
vzquota_proc_release(); -+ vzquota_cache_release(); -+} -+#endif -+ -+MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); -+MODULE_DESCRIPTION("Virtuozzo Disk Quota"); -+MODULE_LICENSE("GPL v2"); -+ -+module_init(vzquota_init) -+#if defined(VZ_QUOTA_UNLOAD) -+module_exit(vzquota_release) -+#endif -diff -upr linux-2.6.16.orig/fs/xattr.c linux-2.6.16-026test009/fs/xattr.c ---- linux-2.6.16.orig/fs/xattr.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/xattr.c 2006-04-19 15:02:11.000000000 +0400 -@@ -58,7 +58,7 @@ xattr_permission(struct inode *inode, co - return -EPERM; - } - -- return permission(inode, mask, NULL); -+ return permission(inode, mask, NULL, NULL); - } - - int -diff -upr linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_aops.c linux-2.6.16-026test009/fs/xfs/linux-2.6/xfs_aops.c ---- linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_aops.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/xfs/linux-2.6/xfs_aops.c 2006-04-19 15:02:11.000000000 +0400 -@@ -616,7 +616,7 @@ xfs_is_delayed_page( - acceptable = (type == IOMAP_UNWRITTEN); - else if (buffer_delay(bh)) - acceptable = (type == IOMAP_DELAY); -- else if (buffer_mapped(bh)) -+ else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == 0); - else - break; -diff -upr linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_iops.c linux-2.6.16-026test009/fs/xfs/linux-2.6/xfs_iops.c ---- linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_iops.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/fs/xfs/linux-2.6/xfs_iops.c 2006-04-19 15:02:11.000000000 +0400 -@@ -615,7 +615,8 @@ STATIC int - linvfs_permission( - struct inode *inode, - int mode, -- struct nameidata *nd) -+ struct nameidata *nd, -+ struct exec_perm *perm) - { - vnode_t *vp = LINVFS_GET_VP(inode); - int error; -@@ -673,8 +674,7 @@ linvfs_setattr( - if (ia_valid & ATTR_ATIME) { - vattr.va_mask |= XFS_AT_ATIME; - vattr.va_atime = attr->ia_atime; -- if (ia_valid & ATTR_ATIME_SET) -- inode->i_atime = attr->ia_atime; -+ inode->i_atime = 
attr->ia_atime; - } - if (ia_valid & ATTR_MTIME) { - vattr.va_mask |= XFS_AT_MTIME; -diff -upr linux-2.6.16.orig/include/asm-arm26/tlbflush.h linux-2.6.16-026test009/include/asm-arm26/tlbflush.h ---- linux-2.6.16.orig/include/asm-arm26/tlbflush.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-arm26/tlbflush.h 2006-04-19 15:02:12.000000000 +0400 -@@ -25,7 +25,7 @@ static inline void memc_update_all(void) - { - struct task_struct *p; - cpu_memc_update_all(init_mm.pgd); -- for_each_process(p) { -+ for_each_process_all(p) { - if (!p->mm) - continue; - cpu_memc_update_all(p->mm->pgd); -diff -upr linux-2.6.16.orig/include/asm-generic/atomic.h linux-2.6.16-026test009/include/asm-generic/atomic.h ---- linux-2.6.16.orig/include/asm-generic/atomic.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-generic/atomic.h 2006-04-19 15:02:12.000000000 +0400 -@@ -66,6 +66,13 @@ static inline void atomic_long_sub(long - atomic64_sub(i, v); - } - -+static inline int atomic_long_add_negative(long i, atomic_long_t *l) -+{ -+ atomic64_t *v = (atomic64_t *)l; -+ -+ return atomic64_add_negative(i, v); -+} -+ - #else - - typedef atomic_t atomic_long_t; -@@ -113,5 +120,12 @@ static inline void atomic_long_sub(long - atomic_sub(i, v); - } - -+static inline int atomic_long_add_negative(long i, atomic_long_t *l) -+{ -+ atomic_t *v = (atomic_t *)l; -+ -+ return atomic_add_negative(i, v); -+} -+ - #endif - #endif -diff -upr linux-2.6.16.orig/include/asm-i386/bug.h linux-2.6.16-026test009/include/asm-i386/bug.h ---- linux-2.6.16.orig/include/asm-i386/bug.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-i386/bug.h 2006-04-19 15:02:11.000000000 +0400 -@@ -14,7 +14,10 @@ - #ifdef CONFIG_DEBUG_BUGVERBOSE - #define BUG() \ - __asm__ __volatile__( "ud2\n" \ -+ "\t.byte 0x66\n"\ -+ "\t.byte 0xb8\n" /* mov $xxx, %ax */\ - "\t.word %c0\n" \ -+ "\t.byte 0xb8\n" /* mov $xxx, %eax */\ - "\t.long %c1\n" \ - : : "i" 
(__LINE__), "i" (__FILE__)) - #else -diff -upr linux-2.6.16.orig/include/asm-i386/elf.h linux-2.6.16-026test009/include/asm-i386/elf.h ---- linux-2.6.16.orig/include/asm-i386/elf.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-i386/elf.h 2006-04-19 15:02:12.000000000 +0400 -@@ -108,7 +108,7 @@ typedef struct user_fxsr_struct elf_fpxr - For the moment, we have only optimizations for the Intel generations, - but that could change... */ - --#define ELF_PLATFORM (system_utsname.machine) -+#define ELF_PLATFORM (ve_utsname.machine) - - #ifdef __KERNEL__ - #define SET_PERSONALITY(ex, ibcs2) do { } while (0) -@@ -136,8 +136,10 @@ extern void __kernel_vsyscall; - - #define ARCH_DLINFO \ - do { \ -+ if (sysctl_at_vsyscall) { \ - NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ -+ } \ - } while (0) - - /* -diff -upr linux-2.6.16.orig/include/asm-i386/mman.h linux-2.6.16-026test009/include/asm-i386/mman.h ---- linux-2.6.16.orig/include/asm-i386/mman.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-i386/mman.h 2006-04-19 15:02:11.000000000 +0400 -@@ -10,6 +10,7 @@ - #define MAP_NORESERVE 0x4000 /* don't check for reservations */ - #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ - #define MAP_NONBLOCK 0x10000 /* do not block on IO */ -+#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ - - #define MCL_CURRENT 1 /* lock all current mappings */ - #define MCL_FUTURE 2 /* lock all future mappings */ -diff -upr linux-2.6.16.orig/include/asm-i386/nmi.h linux-2.6.16-026test009/include/asm-i386/nmi.h ---- linux-2.6.16.orig/include/asm-i386/nmi.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-i386/nmi.h 2006-04-19 15:02:11.000000000 +0400 -@@ -17,6 +17,7 @@ typedef int (*nmi_callback_t)(struct pt_ - * set. Return 1 if the NMI was handled. 
- */ - void set_nmi_callback(nmi_callback_t callback); -+void set_nmi_ipi_callback(nmi_callback_t callback); - - /** - * unset_nmi_callback -@@ -24,5 +25,6 @@ void set_nmi_callback(nmi_callback_t cal - * Remove the handler previously set. - */ - void unset_nmi_callback(void); -+void unset_nmi_ipi_callback(void); - - #endif /* ASM_NMI_H */ -diff -upr linux-2.6.16.orig/include/asm-i386/thread_info.h linux-2.6.16-026test009/include/asm-i386/thread_info.h ---- linux-2.6.16.orig/include/asm-i386/thread_info.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-i386/thread_info.h 2006-04-19 15:02:12.000000000 +0400 -@@ -101,13 +101,13 @@ register unsigned long current_stack_poi - ({ \ - struct thread_info *ret; \ - \ -- ret = kmalloc(THREAD_SIZE, GFP_KERNEL); \ -+ ret = kmalloc(THREAD_SIZE, GFP_KERNEL_UBC); \ - if (ret) \ - memset(ret, 0, THREAD_SIZE); \ - ret; \ - }) - #else --#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) -+#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL_UBC) - #endif - - #define free_thread_info(info) kfree(info) -@@ -142,7 +142,8 @@ register unsigned long current_stack_poi - #define TIF_SECCOMP 8 /* secure computing */ - #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ - #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ --#define TIF_MEMDIE 17 -+#define TIF_FREEZE 17 /* Freeze request, atomic version of PF_FREEZE */ -+#define TIF_MEMDIE 18 - - #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) - #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) -diff -upr linux-2.6.16.orig/include/asm-i386/timex.h linux-2.6.16-026test009/include/asm-i386/timex.h ---- linux-2.6.16.orig/include/asm-i386/timex.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-i386/timex.h 2006-04-19 15:02:12.000000000 +0400 -@@ -36,13 +36,17 @@ static inline cycles_t get_cycles (void) - { - unsigned long long ret=0; - --#ifndef CONFIG_X86_TSC -- if 
(!cpu_has_tsc) -- return 0; --#endif -- - #if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) - rdtscll(ret); -+#elif defined(CONFIG_VE) -+ /* -+ * get_cycles is used in the following calculations: -+ * - VPS idle and iowait times in kernel/shced.h -+ * - task's sleep time to be shown with SyRq-t -+ * - kstat latencies in linux/vzstat.h -+ * - sched latency via wakeup_stamp in linux/ve_task.h -+ */ -+#warning "some of VPS statistics won't be correct without get_cycles() (kstat_lat, ve_idle, etc)" - #endif - return ret; - } -diff -upr linux-2.6.16.orig/include/asm-i386/unistd.h linux-2.6.16-026test009/include/asm-i386/unistd.h ---- linux-2.6.16.orig/include/asm-i386/unistd.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-i386/unistd.h 2006-04-19 15:02:11.000000000 +0400 -@@ -316,8 +316,11 @@ - #define __NR_pselect6 308 - #define __NR_ppoll 309 - #define __NR_unshare 310 -- --#define NR_syscalls 311 -+#define __NR_getluid 510 -+#define __NR_setluid 511 -+#define __NR_setublimit 512 -+#define __NR_ubstat 513 -+#define NR_syscalls 513 - - /* - * user-visible error numbers are in the range -1 - -128: see -diff -upr linux-2.6.16.orig/include/asm-ia64/mman.h linux-2.6.16-026test009/include/asm-ia64/mman.h ---- linux-2.6.16.orig/include/asm-ia64/mman.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-ia64/mman.h 2006-04-19 15:02:11.000000000 +0400 -@@ -18,6 +18,7 @@ - #define MAP_NORESERVE 0x04000 /* don't check for reservations */ - #define MAP_POPULATE 0x08000 /* populate (prefault) pagetables */ - #define MAP_NONBLOCK 0x10000 /* do not block on IO */ -+#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ - - #define MCL_CURRENT 1 /* lock all current mappings */ - #define MCL_FUTURE 2 /* lock all future mappings */ -diff -upr linux-2.6.16.orig/include/asm-ia64/pgalloc.h linux-2.6.16-026test009/include/asm-ia64/pgalloc.h ---- linux-2.6.16.orig/include/asm-ia64/pgalloc.h 2006-04-19 15:02:01.000000000 +0400 -+++ 
linux-2.6.16-026test009/include/asm-ia64/pgalloc.h 2006-04-19 15:02:11.000000000 +0400 -@@ -20,6 +20,8 @@ - #include <linux/page-flags.h> - #include <linux/threads.h> - -+#include <ub/ub_mem.h> -+ - #include <asm/mmu_context.h> - - DECLARE_PER_CPU(unsigned long *, __pgtable_quicklist); -@@ -38,7 +40,7 @@ static inline long pgtable_quicklist_tot - return ql_size; - } - --static inline void *pgtable_quicklist_alloc(void) -+static inline void *pgtable_quicklist_alloc(int charge) - { - unsigned long *ret = NULL; - -@@ -46,13 +48,19 @@ static inline void *pgtable_quicklist_al - - ret = pgtable_quicklist; - if (likely(ret != NULL)) { -+ if (ub_page_charge(virt_to_page(ret), 0, -+ charge ? __GFP_UBC|__GFP_SOFT_UBC : 0)) -+ goto out; -+ - pgtable_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - --pgtable_quicklist_size; -+out: - preempt_enable(); - } else { - preempt_enable(); -- ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO); -+ ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO | -+ (charge ? 
__GFP_UBC | __GFP_SOFT_UBC : 0)); - } - - return ret; -@@ -70,6 +78,7 @@ static inline void pgtable_quicklist_fre - #endif - - preempt_disable(); -+ ub_page_uncharge(virt_to_page(pgtable_entry), 0); - *(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist; - pgtable_quicklist = (unsigned long *)pgtable_entry; - ++pgtable_quicklist_size; -@@ -78,7 +87,7 @@ static inline void pgtable_quicklist_fre - - static inline pgd_t *pgd_alloc(struct mm_struct *mm) - { -- return pgtable_quicklist_alloc(); -+ return pgtable_quicklist_alloc(1); - } - - static inline void pgd_free(pgd_t * pgd) -@@ -95,7 +104,7 @@ pgd_populate(struct mm_struct *mm, pgd_t - - static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) - { -- return pgtable_quicklist_alloc(); -+ return pgtable_quicklist_alloc(1); - } - - static inline void pud_free(pud_t * pud) -@@ -113,7 +122,7 @@ pud_populate(struct mm_struct *mm, pud_t - - static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) - { -- return pgtable_quicklist_alloc(); -+ return pgtable_quicklist_alloc(1); - } - - static inline void pmd_free(pmd_t * pmd) -@@ -138,13 +147,13 @@ pmd_populate_kernel(struct mm_struct *mm - static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long addr) - { -- return virt_to_page(pgtable_quicklist_alloc()); -+ return virt_to_page(pgtable_quicklist_alloc(1)); - } - - static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long addr) - { -- return pgtable_quicklist_alloc(); -+ return pgtable_quicklist_alloc(0); - } - - static inline void pte_free(struct page *pte) -diff -upr linux-2.6.16.orig/include/asm-ia64/processor.h linux-2.6.16-026test009/include/asm-ia64/processor.h ---- linux-2.6.16.orig/include/asm-ia64/processor.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-ia64/processor.h 2006-04-19 15:02:12.000000000 +0400 -@@ -306,7 +306,7 @@ struct thread_struct { - regs->loadrs = 0; \ - regs->r8 = 
current->mm->dumpable; /* set "don't zap registers" flag */ \ - regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ -- if (unlikely(!current->mm->dumpable)) { \ -+ if (unlikely(!current->mm->dumpable || !current->mm->vps_dumpable)) { \ - /* \ - * Zap scratch regs to avoid leaking bits between processes with different \ - * uid/privileges. \ -diff -upr linux-2.6.16.orig/include/asm-ia64/unistd.h linux-2.6.16-026test009/include/asm-ia64/unistd.h ---- linux-2.6.16.orig/include/asm-ia64/unistd.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-ia64/unistd.h 2006-04-19 15:02:11.000000000 +0400 -@@ -285,12 +285,17 @@ - #define __NR_faccessat 1293 - /* 1294, 1295 reserved for pselect/ppoll */ - #define __NR_unshare 1296 -+#define __NR_getluid 1505 -+#define __NR_setluid 1506 -+#define __NR_setublimit 1507 -+#define __NR_ubstat 1508 - - #ifdef __KERNEL__ - - #include <linux/config.h> - --#define NR_syscalls 273 /* length of syscall table */ -+/* length of syscall table */ -+#define NR_syscalls (__NR_ubstat - __NR_ni_syscall + 1) - - #define __ARCH_WANT_SYS_RT_SIGACTION - -diff -upr linux-2.6.16.orig/include/asm-m32r/smp.h linux-2.6.16-026test009/include/asm-m32r/smp.h ---- linux-2.6.16.orig/include/asm-m32r/smp.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-m32r/smp.h 2006-04-19 15:02:11.000000000 +0400 -@@ -67,7 +67,8 @@ extern volatile int cpu_2_physid[NR_CPUS - #define raw_smp_processor_id() (current_thread_info()->cpu) - - extern cpumask_t cpu_callout_map; --#define cpu_possible_map cpu_callout_map -+extern cpumask_t cpu_possible_map; -+extern cpumask_t cpu_present_map; - - static __inline__ int hard_smp_processor_id(void) - { -diff -upr linux-2.6.16.orig/include/asm-m32r/uaccess.h linux-2.6.16-026test009/include/asm-m32r/uaccess.h ---- linux-2.6.16.orig/include/asm-m32r/uaccess.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-m32r/uaccess.h 2006-04-19 
15:02:11.000000000 +0400 -@@ -5,17 +5,9 @@ - * linux/include/asm-m32r/uaccess.h - * - * M32R version. -- * Copyright (C) 2004 Hirokazu Takata <takata at linux-m32r.org> -+ * Copyright (C) 2004, 2006 Hirokazu Takata <takata at linux-m32r.org> - */ - --#undef UACCESS_DEBUG -- --#ifdef UACCESS_DEBUG --#define UAPRINTK(args...) printk(args) --#else --#define UAPRINTK(args...) --#endif /* UACCESS_DEBUG */ -- - /* - * User space memory access functions - */ -@@ -38,27 +30,29 @@ - #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - - #ifdef CONFIG_MMU -+ - #define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) - #define USER_DS MAKE_MM_SEG(PAGE_OFFSET) --#else --#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) --#define USER_DS MAKE_MM_SEG(0xFFFFFFFF) --#endif /* CONFIG_MMU */ -- - #define get_ds() (KERNEL_DS) --#ifdef CONFIG_MMU - #define get_fs() (current_thread_info()->addr_limit) - #define set_fs(x) (current_thread_info()->addr_limit = (x)) --#else -+ -+#else /* not CONFIG_MMU */ -+ -+#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) -+#define USER_DS MAKE_MM_SEG(0xFFFFFFFF) -+#define get_ds() (KERNEL_DS) -+ - static inline mm_segment_t get_fs(void) - { -- return USER_DS; -+ return USER_DS; - } - - static inline void set_fs(mm_segment_t s) - { - } --#endif /* CONFIG_MMU */ -+ -+#endif /* not CONFIG_MMU */ - - #define segment_eq(a,b) ((a).seg == (b).seg) - -@@ -83,9 +77,9 @@ static inline void set_fs(mm_segment_t s - " subx %0, %0\n" \ - " cmpu %4, %1\n" \ - " subx %0, %5\n" \ -- : "=&r"(flag), "=r"(sum) \ -- : "1"(addr), "r"((int)(size)), \ -- "r"(current_thread_info()->addr_limit.seg), "r"(0) \ -+ : "=&r" (flag), "=r" (sum) \ -+ : "1" (addr), "r" ((int)(size)), \ -+ "r" (current_thread_info()->addr_limit.seg), "r" (0) \ - : "cbit" ); \ - flag; }) - -@@ -113,10 +107,10 @@ static inline void set_fs(mm_segment_t s - #else - static inline int access_ok(int type, const void *addr, unsigned long size) - { -- extern unsigned long memory_start, memory_end; -- unsigned long val = (unsigned long)addr; -+ 
extern unsigned long memory_start, memory_end; -+ unsigned long val = (unsigned long)addr; - -- return ((val >= memory_start) && ((val + size) < memory_end)); -+ return ((val >= memory_start) && ((val + size) < memory_end)); - } - #endif /* CONFIG_MMU */ - -@@ -155,39 +149,6 @@ extern int fixup_exception(struct pt_reg - * accesses to the same area of user memory). - */ - --extern void __get_user_1(void); --extern void __get_user_2(void); --extern void __get_user_4(void); -- --#ifndef MODULE --#define __get_user_x(size,ret,x,ptr) \ -- __asm__ __volatile__( \ -- " mv r0, %0\n" \ -- " mv r1, %1\n" \ -- " bl __get_user_" #size "\n" \ -- " mv %0, r0\n" \ -- " mv %1, r1\n" \ -- : "=r"(ret), "=r"(x) \ -- : "0"(ptr) \ -- : "r0", "r1", "r14" ) --#else /* MODULE */ --/* -- * Use "jl" instead of "bl" for MODULE -- */ --#define __get_user_x(size,ret,x,ptr) \ -- __asm__ __volatile__( \ -- " mv r0, %0\n" \ -- " mv r1, %1\n" \ -- " seth lr, #high(__get_user_" #size ")\n" \ -- " or3 lr, lr, #low(__get_user_" #size ")\n" \ -- " jl lr\n" \ -- " mv %0, r0\n" \ -- " mv %1, r1\n" \ -- : "=r"(ret), "=r"(x) \ -- : "0"(ptr) \ -- : "r0", "r1", "r14" ) --#endif -- - /* Careful: we have to cast the result to the type of the pointer for sign - reasons */ - /** -@@ -208,20 +169,7 @@ extern void __get_user_4(void); - * On error, the variable @x is set to zero. - */ - #define get_user(x,ptr) \ --({ int __ret_gu; \ -- unsigned long __val_gu; \ -- __chk_user_ptr(ptr); \ -- switch(sizeof (*(ptr))) { \ -- case 1: __get_user_x(1,__ret_gu,__val_gu,ptr); break; \ -- case 2: __get_user_x(2,__ret_gu,__val_gu,ptr); break; \ -- case 4: __get_user_x(4,__ret_gu,__val_gu,ptr); break; \ -- default: __get_user_x(X,__ret_gu,__val_gu,ptr); break; \ -- } \ -- (x) = (__typeof__(*(ptr)))__val_gu; \ -- __ret_gu; \ --}) -- --extern void __put_user_bad(void); -+ __get_user_check((x),(ptr),sizeof(*(ptr))) - - /** - * put_user: - Write a simple value into user space. 
-@@ -240,8 +188,7 @@ extern void __put_user_bad(void); - * Returns zero on success, or -EFAULT on error. - */ - #define put_user(x,ptr) \ -- __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) -- -+ __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) - - /** - * __get_user: - Get a simple variable from user space, with less checking. -@@ -264,8 +211,64 @@ extern void __put_user_bad(void); - * On error, the variable @x is set to zero. - */ - #define __get_user(x,ptr) \ -- __get_user_nocheck((x),(ptr),sizeof(*(ptr))) -+ __get_user_nocheck((x),(ptr),sizeof(*(ptr))) - -+#define __get_user_nocheck(x,ptr,size) \ -+({ \ -+ long __gu_err = 0; \ -+ unsigned long __gu_val; \ -+ might_sleep(); \ -+ __get_user_size(__gu_val,(ptr),(size),__gu_err); \ -+ (x) = (__typeof__(*(ptr)))__gu_val; \ -+ __gu_err; \ -+}) -+ -+#define __get_user_check(x,ptr,size) \ -+({ \ -+ long __gu_err = -EFAULT; \ -+ unsigned long __gu_val = 0; \ -+ const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ -+ might_sleep(); \ -+ if (access_ok(VERIFY_READ,__gu_addr,size)) \ -+ __get_user_size(__gu_val,__gu_addr,(size),__gu_err); \ -+ (x) = (__typeof__(*(ptr)))__gu_val; \ -+ __gu_err; \ -+}) -+ -+extern long __get_user_bad(void); -+ -+#define __get_user_size(x,ptr,size,retval) \ -+do { \ -+ retval = 0; \ -+ __chk_user_ptr(ptr); \ -+ switch (size) { \ -+ case 1: __get_user_asm(x,ptr,retval,"ub"); break; \ -+ case 2: __get_user_asm(x,ptr,retval,"uh"); break; \ -+ case 4: __get_user_asm(x,ptr,retval,""); break; \ -+ default: (x) = __get_user_bad(); \ -+ } \ -+} while (0) -+ -+#define __get_user_asm(x, addr, err, itype) \ -+ __asm__ __volatile__( \ -+ " .fillinsn\n" \ -+ "1: ld"itype" %1,@%2\n" \ -+ " .fillinsn\n" \ -+ "2:\n" \ -+ ".section .fixup,\"ax\"\n" \ -+ " .balign 4\n" \ -+ "3: ldi %0,%3\n" \ -+ " seth r14,#high(2b)\n" \ -+ " or3 r14,r14,#low(2b)\n" \ -+ " jmp r14\n" \ -+ ".previous\n" \ -+ ".section __ex_table,\"a\"\n" \ -+ " .balign 4\n" \ -+ " .long 1b,3b\n" \ -+ ".previous" \ -+ 
: "=&r" (err), "=&r" (x) \ -+ : "r" (addr), "i" (-EFAULT), "0" (err) \ -+ : "r14", "memory") - - /** - * __put_user: - Write a simple value into user space, with less checking. -@@ -287,11 +290,13 @@ extern void __put_user_bad(void); - * Returns zero on success, or -EFAULT on error. - */ - #define __put_user(x,ptr) \ -- __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) -+ __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) -+ - - #define __put_user_nocheck(x,ptr,size) \ - ({ \ - long __pu_err; \ -+ might_sleep(); \ - __put_user_size((x),(ptr),(size),__pu_err); \ - __pu_err; \ - }) -@@ -308,28 +313,28 @@ extern void __put_user_bad(void); - }) - - #if defined(__LITTLE_ENDIAN__) --#define __put_user_u64(x, addr, err) \ -- __asm__ __volatile__( \ -- " .fillinsn\n" \ -- "1: st %L1,@%2\n" \ -- " .fillinsn\n" \ -- "2: st %H1,@(4,%2)\n" \ -- " .fillinsn\n" \ -- "3:\n" \ -- ".section .fixup,\"ax\"\n" \ -- " .balign 4\n" \ -- "4: ldi %0,%3\n" \ -- " seth r14,#high(3b)\n" \ -- " or3 r14,r14,#low(3b)\n" \ -- " jmp r14\n" \ -- ".previous\n" \ -- ".section __ex_table,\"a\"\n" \ -- " .balign 4\n" \ -- " .long 1b,4b\n" \ -- " .long 2b,4b\n" \ -- ".previous" \ -- : "=&r"(err) \ -- : "r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \ -+#define __put_user_u64(x, addr, err) \ -+ __asm__ __volatile__( \ -+ " .fillinsn\n" \ -+ "1: st %L1,@%2\n" \ -+ " .fillinsn\n" \ -+ "2: st %H1,@(4,%2)\n" \ -+ " .fillinsn\n" \ -+ "3:\n" \ -+ ".section .fixup,\"ax\"\n" \ -+ " .balign 4\n" \ -+ "4: ldi %0,%3\n" \ -+ " seth r14,#high(3b)\n" \ -+ " or3 r14,r14,#low(3b)\n" \ -+ " jmp r14\n" \ -+ ".previous\n" \ -+ ".section __ex_table,\"a\"\n" \ -+ " .balign 4\n" \ -+ " .long 1b,4b\n" \ -+ " .long 2b,4b\n" \ -+ ".previous" \ -+ : "=&r" (err) \ -+ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \ - : "r14", "memory") - - #elif defined(__BIG_ENDIAN__) -@@ -353,13 +358,15 @@ extern void __put_user_bad(void); - " .long 1b,4b\n" \ - " .long 2b,4b\n" \ - ".previous" \ -- : "=&r"(err) \ -- : 
"r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \ -+ : "=&r" (err) \ -+ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \ - : "r14", "memory") - #else - #error no endian defined - #endif - -+extern void __put_user_bad(void); -+ - #define __put_user_size(x,ptr,size,retval) \ - do { \ - retval = 0; \ -@@ -398,52 +405,8 @@ struct __large_struct { unsigned long bu - " .balign 4\n" \ - " .long 1b,3b\n" \ - ".previous" \ -- : "=&r"(err) \ -- : "r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \ -- : "r14", "memory") -- --#define __get_user_nocheck(x,ptr,size) \ --({ \ -- long __gu_err; \ -- unsigned long __gu_val; \ -- __get_user_size(__gu_val,(ptr),(size),__gu_err); \ -- (x) = (__typeof__(*(ptr)))__gu_val; \ -- __gu_err; \ --}) -- --extern long __get_user_bad(void); -- --#define __get_user_size(x,ptr,size,retval) \ --do { \ -- retval = 0; \ -- __chk_user_ptr(ptr); \ -- switch (size) { \ -- case 1: __get_user_asm(x,ptr,retval,"ub"); break; \ -- case 2: __get_user_asm(x,ptr,retval,"uh"); break; \ -- case 4: __get_user_asm(x,ptr,retval,""); break; \ -- default: (x) = __get_user_bad(); \ -- } \ --} while (0) -- --#define __get_user_asm(x, addr, err, itype) \ -- __asm__ __volatile__( \ -- " .fillinsn\n" \ -- "1: ld"itype" %1,@%2\n" \ -- " .fillinsn\n" \ -- "2:\n" \ -- ".section .fixup,\"ax\"\n" \ -- " .balign 4\n" \ -- "3: ldi %0,%3\n" \ -- " seth r14,#high(2b)\n" \ -- " or3 r14,r14,#low(2b)\n" \ -- " jmp r14\n" \ -- ".previous\n" \ -- ".section __ex_table,\"a\"\n" \ -- " .balign 4\n" \ -- " .long 1b,3b\n" \ -- ".previous" \ -- : "=&r"(err), "=&r"(x) \ -- : "r"(addr), "i"(-EFAULT), "0"(err) \ -+ : "=&r" (err) \ -+ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \ - : "r14", "memory") - - /* -@@ -453,7 +416,6 @@ do { \ - * anything, so this is accurate. 
- */ - -- - /* - * Copy To/From Userspace - */ -@@ -511,8 +473,9 @@ do { \ - " .long 2b,9b\n" \ - " .long 3b,9b\n" \ - ".previous\n" \ -- : "=&r"(__dst), "=&r"(__src), "=&r"(size), "=&r"(__c) \ -- : "0"(to), "1"(from), "2"(size), "3"(size / 4) \ -+ : "=&r" (__dst), "=&r" (__src), "=&r" (size), \ -+ "=&r" (__c) \ -+ : "0" (to), "1" (from), "2" (size), "3" (size / 4) \ - : "r14", "memory"); \ - } while (0) - -@@ -573,8 +536,9 @@ do { \ - " .long 2b,7b\n" \ - " .long 3b,7b\n" \ - ".previous\n" \ -- : "=&r"(__dst), "=&r"(__src), "=&r"(size), "=&r"(__c) \ -- : "0"(to), "1"(from), "2"(size), "3"(size / 4) \ -+ : "=&r" (__dst), "=&r" (__src), "=&r" (size), \ -+ "=&r" (__c) \ -+ : "0" (to), "1" (from), "2" (size), "3" (size / 4) \ - : "r14", "memory"); \ - } while (0) - -@@ -676,7 +640,7 @@ unsigned long __generic_copy_from_user(v - #define copy_from_user(to,from,n) \ - ({ \ - might_sleep(); \ --__generic_copy_from_user((to),(from),(n)); \ -+ __generic_copy_from_user((to),(from),(n)); \ - }) - - long __must_check strncpy_from_user(char *dst, const char __user *src, -diff -upr linux-2.6.16.orig/include/asm-powerpc/floppy.h linux-2.6.16-026test009/include/asm-powerpc/floppy.h ---- linux-2.6.16.orig/include/asm-powerpc/floppy.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-powerpc/floppy.h 2006-04-19 15:02:11.000000000 +0400 -@@ -35,6 +35,7 @@ - #ifdef CONFIG_PCI - - #include <linux/pci.h> -+#include <asm/ppc-pci.h> /* for ppc64_isabridge_dev */ - - #define fd_dma_setup(addr,size,mode,io) powerpc_fd_dma_setup(addr,size,mode,io) - -@@ -52,12 +53,12 @@ static __inline__ int powerpc_fd_dma_set - if (bus_addr - && (addr != prev_addr || size != prev_size || dir != prev_dir)) { - /* different from last time -- unmap prev */ -- pci_unmap_single(NULL, bus_addr, prev_size, prev_dir); -+ pci_unmap_single(ppc64_isabridge_dev, bus_addr, prev_size, prev_dir); - bus_addr = 0; - } - - if (!bus_addr) /* need to map it */ -- bus_addr = pci_map_single(NULL, addr, 
size, dir); -+ bus_addr = pci_map_single(ppc64_isabridge_dev, addr, size, dir); - - /* remember this one as prev */ - prev_addr = addr; -diff -upr linux-2.6.16.orig/include/asm-powerpc/pgalloc.h linux-2.6.16-026test009/include/asm-powerpc/pgalloc.h ---- linux-2.6.16.orig/include/asm-powerpc/pgalloc.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-powerpc/pgalloc.h 2006-04-19 15:02:11.000000000 +0400 -@@ -33,7 +33,8 @@ extern kmem_cache_t *pgtable_cache[]; - - static inline pgd_t *pgd_alloc(struct mm_struct *mm) - { -- return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); -+ return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], -+ GFP_KERNEL_UBC | __GFP_SOFT_UBC); - } - - static inline void pgd_free(pgd_t *pgd) -@@ -48,7 +49,7 @@ static inline void pgd_free(pgd_t *pgd) - static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) - { - return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], -- GFP_KERNEL|__GFP_REPEAT); -+ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); - } - - static inline void pud_free(pud_t *pud) -@@ -84,7 +85,7 @@ static inline void pmd_populate_kernel(s - static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) - { - return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM], -- GFP_KERNEL|__GFP_REPEAT); -+ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); - } - - static inline void pmd_free(pmd_t *pmd) -@@ -92,17 +93,21 @@ static inline void pmd_free(pmd_t *pmd) - kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); - } - -+static inline pte_t *__pte_alloc(gfp_t flags) -+{ -+ return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], flags); -+} -+ - static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) - { -- return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], -- GFP_KERNEL|__GFP_REPEAT); -+ return __pte_alloc(GFP_KERNEL | __GFP_REPEAT); - } - - static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) - { -- return 
virt_to_page(pte_alloc_one_kernel(mm, address)); -+ return virt_to_page(__pte_alloc(GFP_KERNEL_UBC | __GFP_SOFT_UBC)); - } - - static inline void pte_free_kernel(pte_t *pte) -diff -upr linux-2.6.16.orig/include/asm-powerpc/unistd.h linux-2.6.16-026test009/include/asm-powerpc/unistd.h ---- linux-2.6.16.orig/include/asm-powerpc/unistd.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-powerpc/unistd.h 2006-04-19 15:02:11.000000000 +0400 -@@ -301,8 +301,12 @@ - #define __NR_pselect6 280 - #define __NR_ppoll 281 - #define __NR_unshare 282 -- --#define __NR_syscalls 283 -+#define __NR_getluid 410 -+#define __NR_setluid 411 -+#define __NR_setublimit 412 -+#define __NR_ubstat 413 -+ -+#define NR_syscalls 414 - - #ifdef __KERNEL__ - #define __NR__exit __NR_exit -diff -upr linux-2.6.16.orig/include/asm-s390/pgalloc.h linux-2.6.16-026test009/include/asm-s390/pgalloc.h ---- linux-2.6.16.orig/include/asm-s390/pgalloc.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-s390/pgalloc.h 2006-04-19 15:02:11.000000000 +0400 -@@ -34,12 +34,12 @@ static inline pgd_t *pgd_alloc(struct mm - int i; - - #ifndef __s390x__ -- pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,1); -+ pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 1); - if (pgd != NULL) - for (i = 0; i < USER_PTRS_PER_PGD; i++) - pmd_clear(pmd_offset(pgd + i, i*PGDIR_SIZE)); - #else /* __s390x__ */ -- pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,2); -+ pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2); - if (pgd != NULL) - for (i = 0; i < PTRS_PER_PGD; i++) - pgd_clear(pgd + i); -@@ -72,7 +72,7 @@ static inline pmd_t * pmd_alloc_one(stru - pmd_t *pmd; - int i; - -- pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 2); -+ pmd = (pmd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2); - if (pmd != NULL) { - for (i=0; i < PTRS_PER_PMD; i++) - pmd_clear(pmd+i); -@@ -118,16 +118,13 @@ pmd_populate(struct mm_struct *mm, pmd_t - 
pmd_populate_kernel(mm, pmd, (pte_t *)((page-mem_map) << PAGE_SHIFT)); - } - --/* -- * page table entry allocation/free routines. -- */ --static inline pte_t * --pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) -+static inline pte_t *pte_alloc(struct mm_struct *mm, unsigned long vmaddr, -+ gfp_t mask) - { - pte_t *pte; - int i; - -- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); -+ pte = (pte_t *)__get_free_page(mask); - if (pte != NULL) { - for (i=0; i < PTRS_PER_PTE; i++) { - pte_clear(mm, vmaddr, pte+i); -@@ -137,10 +134,20 @@ pte_alloc_one_kernel(struct mm_struct *m - return pte; - } - -+/* -+ * page table entry allocation/free routines. -+ */ -+static inline pte_t * -+pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) -+{ -+ return pte_alloc(mm, vmaddr, GFP_KERNEL | __GFP_REPEAT); -+} -+ - static inline struct page * - pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr) - { -- pte_t *pte = pte_alloc_one_kernel(mm, vmaddr); -+ pte_t *pte = pte_alloc(mm, vmaddr, GFP_KERNEL_UBC | __GFP_SOFT_UBC | -+ __GFP_REPEAT); - if (pte) - return virt_to_page(pte); - return 0; -diff -upr linux-2.6.16.orig/include/asm-sh64/pgalloc.h linux-2.6.16-026test009/include/asm-sh64/pgalloc.h ---- linux-2.6.16.orig/include/asm-sh64/pgalloc.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-sh64/pgalloc.h 2006-04-19 15:02:12.000000000 +0400 -@@ -173,7 +173,7 @@ static inline void set_pgdir(unsigned lo - pgd_t *pgd; - - read_lock(&tasklist_lock); -- for_each_process(p) { -+ for_each_process_all(p) { - if (!p->mm) - continue; - *pgd_offset(p->mm,address) = entry; -diff -upr linux-2.6.16.orig/include/asm-x86_64/mman.h linux-2.6.16-026test009/include/asm-x86_64/mman.h ---- linux-2.6.16.orig/include/asm-x86_64/mman.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-x86_64/mman.h 2006-04-19 15:02:11.000000000 +0400 -@@ -12,6 +12,7 @@ - #define MAP_NORESERVE 0x4000 /* don't check for 
reservations */ - #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ - #define MAP_NONBLOCK 0x10000 /* do not block on IO */ -+#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ - - #define MCL_CURRENT 1 /* lock all current mappings */ - #define MCL_FUTURE 2 /* lock all future mappings */ -diff -upr linux-2.6.16.orig/include/asm-x86_64/nmi.h linux-2.6.16-026test009/include/asm-x86_64/nmi.h ---- linux-2.6.16.orig/include/asm-x86_64/nmi.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-x86_64/nmi.h 2006-04-19 15:02:11.000000000 +0400 -@@ -24,6 +24,9 @@ void set_nmi_callback(nmi_callback_t cal - * Remove the handler previously set. - */ - void unset_nmi_callback(void); -+ -+void set_nmi_ipi_callback(nmi_callback_t callback); -+void unset_nmi_ipi_callback(void); - - #ifdef CONFIG_PM - -diff -upr linux-2.6.16.orig/include/asm-x86_64/pgalloc.h linux-2.6.16-026test009/include/asm-x86_64/pgalloc.h ---- linux-2.6.16.orig/include/asm-x86_64/pgalloc.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-x86_64/pgalloc.h 2006-04-19 15:02:11.000000000 +0400 -@@ -31,12 +31,14 @@ static inline void pmd_free(pmd_t *pmd) - - static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) - { -- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); -+ return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| -+ __GFP_SOFT_UBC); - } - - static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) - { -- return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); -+ return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| -+ __GFP_SOFT_UBC); - } - - static inline void pud_free (pud_t *pud) -@@ -48,7 +50,8 @@ static inline void pud_free (pud_t *pud) - static inline pgd_t *pgd_alloc(struct mm_struct *mm) - { - unsigned boundary; -- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); -+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC|__GFP_REPEAT| -+ __GFP_SOFT_UBC); 
- if (!pgd) - return NULL; - /* -@@ -77,7 +80,8 @@ static inline pte_t *pte_alloc_one_kerne - - static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) - { -- void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); -+ void *p = (void *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| -+ __GFP_SOFT_UBC); - if (!p) - return NULL; - return virt_to_page(p); -diff -upr linux-2.6.16.orig/include/asm-x86_64/processor.h linux-2.6.16-026test009/include/asm-x86_64/processor.h ---- linux-2.6.16.orig/include/asm-x86_64/processor.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-x86_64/processor.h 2006-04-19 15:02:12.000000000 +0400 -@@ -167,7 +167,7 @@ static inline void clear_in_cr4 (unsigne - /* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ --#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) -+#define IA32_PAGE_OFFSET 0xc0000000 - - #define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) - #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) -diff -upr linux-2.6.16.orig/include/asm-x86_64/segment.h linux-2.6.16-026test009/include/asm-x86_64/segment.h ---- linux-2.6.16.orig/include/asm-x86_64/segment.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-x86_64/segment.h 2006-04-19 15:02:12.000000000 +0400 -@@ -3,29 +3,28 @@ - - #include <asm/cache.h> - --#define __KERNEL_CS 0x10 --#define __KERNEL_DS 0x18 -- --#define __KERNEL32_CS 0x38 -- -+#define GDT_ENTRY_BOOT_CS 2 -+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) -+#define GDT_ENTRY_BOOT_DS 3 -+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) -+#define GDT_ENTRY_TSS 4 /* needs two entries */ - /* - * we cannot use the same code segment descriptor for user and kernel - * -- not even in the long flat mode, because of different DPL /kkeil - * The segment offset needs to contain a RPL. Grr. 
-AK - * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) - */ -+#define GDT_ENTRY_TLS_MIN 6 -+#define GDT_ENTRY_TLS_MAX 8 - --#define __USER32_CS 0x23 /* 4*8+3 */ --#define __USER_DS 0x2b /* 5*8+3 */ --#define __USER_CS 0x33 /* 6*8+3 */ -+#define GDT_ENTRY_LDT 9 /* needs two entries */ -+#define __KERNEL32_CS 0x58 /* 11*8 */ -+#define __KERNEL_CS 0x60 /* 12*8 */ -+#define __KERNEL_DS 0x68 /* 13*8 */ -+#define __USER32_CS 0x73 /* 14*8+3 */ -+#define __USER_DS 0x7b /* 15*8+3 */ - #define __USER32_DS __USER_DS -- --#define GDT_ENTRY_TLS 1 --#define GDT_ENTRY_TSS 8 /* needs two entries */ --#define GDT_ENTRY_LDT 10 /* needs two entries */ --#define GDT_ENTRY_TLS_MIN 12 --#define GDT_ENTRY_TLS_MAX 14 --/* 15 free */ -+#define __USER_CS 0x83 /* 16*8+3 */ - - #define GDT_ENTRY_TLS_ENTRIES 3 - -@@ -37,7 +36,7 @@ - #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) - - #define IDT_ENTRIES 256 --#define GDT_ENTRIES 16 -+#define GDT_ENTRIES 32 - #define GDT_SIZE (GDT_ENTRIES * 8) - #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) - -diff -upr linux-2.6.16.orig/include/asm-x86_64/signal.h linux-2.6.16-026test009/include/asm-x86_64/signal.h ---- linux-2.6.16.orig/include/asm-x86_64/signal.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-x86_64/signal.h 2006-04-19 15:02:12.000000000 +0400 -@@ -23,11 +23,6 @@ typedef struct { - unsigned long sig[_NSIG_WORDS]; - } sigset_t; - -- --struct pt_regs; --asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); -- -- - #else - /* Here we must cater to libcs that poke about in kernel headers. 
*/ - -diff -upr linux-2.6.16.orig/include/asm-x86_64/thread_info.h linux-2.6.16-026test009/include/asm-x86_64/thread_info.h ---- linux-2.6.16.orig/include/asm-x86_64/thread_info.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-x86_64/thread_info.h 2006-04-19 15:02:12.000000000 +0400 -@@ -74,7 +74,7 @@ static inline struct thread_info *stack_ - - /* thread information allocation */ - #define alloc_thread_info(tsk) \ -- ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)) -+ ((struct thread_info *) __get_free_pages(GFP_KERNEL_UBC,THREAD_ORDER)) - #define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) - - #else /* !__ASSEMBLY__ */ -@@ -101,11 +101,13 @@ static inline struct thread_info *stack_ - #define TIF_IRET 5 /* force IRET */ - #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ - #define TIF_SECCOMP 8 /* secure computing */ -+#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ - #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ - #define TIF_IA32 17 /* 32bit process */ - #define TIF_FORK 18 /* ret_from_fork */ - #define TIF_ABI_PENDING 19 --#define TIF_MEMDIE 20 -+#define TIF_FREEZE 20 -+#define TIF_MEMDIE 21 - - #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) - #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) -@@ -115,6 +117,7 @@ static inline struct thread_info *stack_ - #define _TIF_IRET (1<<TIF_IRET) - #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) - #define _TIF_SECCOMP (1<<TIF_SECCOMP) -+#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) - #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) - #define _TIF_IA32 (1<<TIF_IA32) - #define _TIF_FORK (1<<TIF_FORK) -diff -upr linux-2.6.16.orig/include/asm-x86_64/unistd.h linux-2.6.16-026test009/include/asm-x86_64/unistd.h ---- linux-2.6.16.orig/include/asm-x86_64/unistd.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/asm-x86_64/unistd.h 2006-04-19 
15:02:12.000000000 +0400 -@@ -605,8 +605,16 @@ __SYSCALL(__NR_pselect6, sys_ni_syscall) - __SYSCALL(__NR_ppoll, sys_ni_syscall) /* for now */ - #define __NR_unshare 272 - __SYSCALL(__NR_unshare, sys_unshare) -- --#define __NR_syscall_max __NR_unshare -+#define __NR_getluid 500 -+__SYSCALL(__NR_getluid, sys_getluid) -+#define __NR_setluid 501 -+__SYSCALL(__NR_setluid, sys_setluid) -+#define __NR_setublimit 502 -+__SYSCALL(__NR_setublimit, sys_setublimit) -+#define __NR_ubstat 503 -+__SYSCALL(__NR_ubstat, sys_ubstat) -+ -+#define __NR_syscall_max __NR_ubstat - - #ifndef __NO_STUBS - -@@ -645,6 +653,7 @@ do { \ - #define __ARCH_WANT_SYS_RT_SIGACTION - #define __ARCH_WANT_SYS_TIME - #define __ARCH_WANT_COMPAT_SYS_TIME -+#define __ARCH_WANT_SYS_RT_SIGSUSPEND - #endif - - #ifndef __KERNEL_SYSCALLS__ -diff -upr linux-2.6.16.orig/include/linux/aio.h linux-2.6.16-026test009/include/linux/aio.h ---- linux-2.6.16.orig/include/linux/aio.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/aio.h 2006-04-19 15:02:12.000000000 +0400 -@@ -247,4 +247,8 @@ static inline struct kiocb *list_kiocb(s - extern unsigned long aio_nr; - extern unsigned long aio_max_nr; - -+void wait_for_all_aios(struct kioctx *ctx); -+extern kmem_cache_t *kioctx_cachep; -+extern void aio_kick_handler(void *); -+ - #endif /* __LINUX__AIO_H */ -diff -upr linux-2.6.16.orig/include/linux/binfmts.h linux-2.6.16-026test009/include/linux/binfmts.h ---- linux-2.6.16.orig/include/linux/binfmts.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/binfmts.h 2006-04-19 15:02:11.000000000 +0400 -@@ -2,6 +2,7 @@ - #define _LINUX_BINFMTS_H - - #include <linux/capability.h> -+#include <linux/fs.h> - - struct pt_regs; - -@@ -28,6 +29,7 @@ struct linux_binprm{ - int sh_bang; - struct file * file; - int e_uid, e_gid; -+ struct exec_perm perm; - kernel_cap_t cap_inheritable, cap_permitted, cap_effective; - void *security; - int argc, envc; -diff -upr 
linux-2.6.16.orig/include/linux/capability.h linux-2.6.16-026test009/include/linux/capability.h ---- linux-2.6.16.orig/include/linux/capability.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/capability.h 2006-04-19 15:02:12.000000000 +0400 -@@ -146,12 +146,9 @@ typedef __u32 kernel_cap_t; - - #define CAP_NET_BROADCAST 11 - --/* Allow interface configuration */ - /* Allow administration of IP firewall, masquerading and accounting */ - /* Allow setting debug option on sockets */ - /* Allow modification of routing tables */ --/* Allow setting arbitrary process / process group ownership on -- sockets */ - /* Allow binding to any address for transparent proxying */ - /* Allow setting TOS (type of service) */ - /* Allow setting promiscuous mode */ -@@ -200,24 +197,19 @@ typedef __u32 kernel_cap_t; - - /* Allow configuration of the secure attention key */ - /* Allow administration of the random device */ --/* Allow examination and configuration of disk quotas */ - /* Allow configuring the kernel's syslog (printk behaviour) */ - /* Allow setting the domainname */ - /* Allow setting the hostname */ - /* Allow calling bdflush() */ --/* Allow mount() and umount(), setting up new smb connection */ -+/* Allow setting up new smb connection */ - /* Allow some autofs root ioctls */ - /* Allow nfsservctl */ - /* Allow VM86_REQUEST_IRQ */ - /* Allow to read/write pci config on alpha */ - /* Allow irix_prctl on mips (setstacksize) */ - /* Allow flushing all cache on m68k (sys_cacheflush) */ --/* Allow removing semaphores */ --/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores -- and shared memory */ - /* Allow locking/unlocking of shared memory segment */ - /* Allow turning swap on/off */ --/* Allow forged pids on socket credentials passing */ - /* Allow setting readahead and flushing buffers on block devices */ - /* Allow setting geometry in floppy driver */ - /* Allow turning DMA on/off in xd driver */ -@@ -288,7 +280,52 @@ 
typedef __u32 kernel_cap_t; - - #define CAP_AUDIT_CONTROL 30 - -+/* -+ * Important note: VZ capabilities do intersect with CAP_AUDIT -+ * this is due to compatibility reasons. Nothing bad. -+ * Both VZ and Audit/SELinux caps are disabled in VPSs. -+ */ -+ -+/* Allow access to all information. In the other case some structures will be -+ hiding to ensure different Virtual Environment non-interaction on the same -+ node */ -+#define CAP_SETVEID 29 -+ -+#define CAP_VE_ADMIN 30 -+ - #ifdef __KERNEL__ -+ -+#include <linux/config.h> -+ -+#ifdef CONFIG_VE -+ -+/* Replacement for CAP_NET_ADMIN: -+ delegated rights to the Virtual environment of its network administration. -+ For now the following rights have been delegated: -+ -+ Allow setting arbitrary process / process group ownership on sockets -+ Allow interface configuration -+ */ -+#define CAP_VE_NET_ADMIN CAP_VE_ADMIN -+ -+/* Replacement for CAP_SYS_ADMIN: -+ delegated rights to the Virtual environment of its administration. -+ For now the following rights have been delegated: -+ */ -+/* Allow mount/umount/remount */ -+/* Allow examination and configuration of disk quotas */ -+/* Allow removing semaphores */ -+/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores -+ and shared memory */ -+/* Allow locking/unlocking of shared memory segment */ -+/* Allow forged pids on socket credentials passing */ -+ -+#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN -+#else -+#define CAP_VE_NET_ADMIN CAP_NET_ADMIN -+#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN -+#endif -+ - /* - * Bounding set - */ -@@ -352,9 +389,14 @@ static inline kernel_cap_t cap_invert(ke - #define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set))) - - #define cap_clear(c) do { cap_t(c) = 0; } while(0) -+#ifndef CONFIG_VE - #define cap_set_full(c) do { cap_t(c) = ~0; } while(0) -+#else -+#define cap_set_full(c) \ -+ do {cap_t(c) = ve_is_super(get_exec_env()) ? 
~0 : \ -+ get_exec_env()->cap_default; } while(0) -+#endif - #define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0) -- - #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) - - extern int capable(int cap); -diff -upr linux-2.6.16.orig/include/linux/coda_linux.h linux-2.6.16-026test009/include/linux/coda_linux.h ---- linux-2.6.16.orig/include/linux/coda_linux.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/coda_linux.h 2006-04-19 15:02:11.000000000 +0400 -@@ -38,7 +38,8 @@ extern struct file_operations coda_ioctl - int coda_open(struct inode *i, struct file *f); - int coda_flush(struct file *f); - int coda_release(struct inode *i, struct file *f); --int coda_permission(struct inode *inode, int mask, struct nameidata *nd); -+int coda_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *); - int coda_revalidate_inode(struct dentry *); - int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); - int coda_setattr(struct dentry *, struct iattr *); -diff -upr linux-2.6.16.orig/include/linux/compat.h linux-2.6.16-026test009/include/linux/compat.h ---- linux-2.6.16.orig/include/linux/compat.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/compat.h 2006-04-19 15:02:12.000000000 +0400 -@@ -181,5 +181,7 @@ static inline int compat_timespec_compar - return lhs->tv_nsec - rhs->tv_nsec; - } - -+extern long compat_nanosleep_restart(struct restart_block *restart); -+ - #endif /* CONFIG_COMPAT */ - #endif /* _LINUX_COMPAT_H */ -diff -upr linux-2.6.16.orig/include/linux/cpt_image.h linux-2.6.16-026test009/include/linux/cpt_image.h ---- linux-2.6.16.orig/include/linux/cpt_image.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/cpt_image.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,1427 @@ -+/* -+ * -+ * include/linux/cpt_image.h -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. 
-+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef __CPT_IMAGE_H_ -+#define __CPT_IMAGE_H_ 1 -+ -+#define CPT_NULL (~0ULL) -+#define CPT_NOINDEX (~0U) -+ -+/* -+ * Image file layout. -+ * -+ * - major header -+ * - sections[] -+ * -+ * Each section is: -+ * - section header -+ * - array of objects -+ * -+ * All data records are arch independent, 64 bit aligned. -+ */ -+ -+enum _cpt_object_type -+{ -+ CPT_OBJ_TASK = 0, -+ CPT_OBJ_MM, -+ CPT_OBJ_FS, -+ CPT_OBJ_FILES, -+ CPT_OBJ_FILE, -+ CPT_OBJ_SIGHAND_STRUCT, -+ CPT_OBJ_SIGNAL_STRUCT, -+ CPT_OBJ_TTY, -+ CPT_OBJ_SOCKET, -+ CPT_OBJ_SYSVSEM_UNDO, -+ CPT_OBJ_NAMESPACE, -+ CPT_OBJ_SYSV_SHM, -+ CPT_OBJ_INODE, -+ CPT_OBJ_UBC, -+ CPT_OBJ_SLM_SGREG, -+ CPT_OBJ_SLM_REGOBJ, -+ CPT_OBJ_SLM_MM, -+ CPT_OBJ_MAX, -+ /* The objects above are stored in memory while checkpointing */ -+ -+ CPT_OBJ_VMA = 1024, -+ CPT_OBJ_FILEDESC, -+ CPT_OBJ_SIGHANDLER, -+ CPT_OBJ_SIGINFO, -+ CPT_OBJ_LASTSIGINFO, -+ CPT_OBJ_SYSV_SEM, -+ CPT_OBJ_SKB, -+ CPT_OBJ_FLOCK, -+ CPT_OBJ_OPENREQ, -+ CPT_OBJ_VFSMOUNT, -+ CPT_OBJ_TRAILER, -+ CPT_OBJ_SYSVSEM_UNDO_REC, -+ CPT_OBJ_NET_DEVICE, -+ CPT_OBJ_NET_IFADDR, -+ CPT_OBJ_NET_ROUTE, -+ CPT_OBJ_NET_CONNTRACK, -+ CPT_OBJ_NET_CONNTRACK_EXPECT, -+ CPT_OBJ_AIO_CONTEXT, -+ CPT_OBJ_VEINFO, -+ CPT_OBJ_EPOLL, -+ CPT_OBJ_EPOLL_FILE, -+ -+ CPT_OBJ_X86_REGS = 4096, -+ CPT_OBJ_X86_64_REGS, -+ CPT_OBJ_PAGES, -+ CPT_OBJ_COPYPAGES, -+ CPT_OBJ_REMAPPAGES, -+ CPT_OBJ_LAZYPAGES, -+ CPT_OBJ_NAME, -+ CPT_OBJ_BITS, -+ CPT_OBJ_REF, -+}; -+ -+#define CPT_ALIGN(n) (((n)+7)&~7) -+ -+struct cpt_major_hdr -+{ -+ __u8 cpt_signature[4]; /* Magic number */ -+ __u16 cpt_hdrlen; /* Length of this header */ -+ __u16 cpt_image_version; /* Format of this file; mbz */ -+ __u16 cpt_os_arch; /* Architecture */ -+#define CPT_OS_ARCH_I386 0 -+#define CPT_OS_ARCH_EMT64 1 -+#define CPT_OS_ARCH_IA64 2 -+ __u16 __cpt_pad1; -+ __u32 cpt_os_version; /* Version of kernel, where image was done */ -+ __u32 cpt_os_features; /* 
Kernel features: SMP etc. */ -+ __u16 cpt_pagesize; /* Page size used by OS */ -+ __u16 cpt_hz; /* HZ used by OS */ -+ __u64 cpt_start_jiffies64; /* Jiffies */ -+ __u32 cpt_start_sec; /* Seconds */ -+ __u32 cpt_start_nsec; /* Nanoseconds */ -+ __u32 cpt_cpu_caps[4]; /* CPU capabilities */ -+ __u32 cpt_kernel_config[4]; /* Kernel config */ -+ __u64 cpt_iptables_mask; /* Used netfilter modules */ -+} __attribute__ ((aligned (8))); -+ -+#define CPT_SIGNATURE0 0x79 -+#define CPT_SIGNATURE1 0x1c -+#define CPT_SIGNATURE2 0x01 -+#define CPT_SIGNATURE3 0x63 -+ -+#define CPT_CPU_X86_CMOV 0 -+#define CPT_CPU_X86_FXSR 1 -+#define CPT_CPU_X86_SSE 2 -+#define CPT_CPU_X86_SSE2 3 -+#define CPT_CPU_X86_MMX 4 -+#define CPT_CPU_X86_3DNOW 5 -+#define CPT_CPU_X86_3DNOW2 6 -+#define CPT_CPU_X86_SEP 7 -+#define CPT_CPU_X86_EMT64 8 -+#define CPT_CPU_X86_IA64 9 -+ -+#define CPT_KERNEL_CONFIG_PAE 0 -+ -+struct cpt_section_hdr -+{ -+ __u64 cpt_next; -+ __u32 cpt_section; -+ __u16 cpt_hdrlen; -+ __u16 cpt_align; -+} __attribute__ ((aligned (8))); -+ -+enum -+{ -+ CPT_SECT_ERROR, /* Error section, content is string */ -+ CPT_SECT_VEINFO, -+ CPT_SECT_FILES, /* Files. Content is array of file objects */ -+ CPT_SECT_TASKS, -+ CPT_SECT_MM, -+ CPT_SECT_FILES_STRUCT, -+ CPT_SECT_FS, -+ CPT_SECT_SIGHAND_STRUCT, -+ CPT_SECT_TTY, -+ CPT_SECT_SOCKET, -+ CPT_SECT_NAMESPACE, -+ CPT_SECT_SYSVSEM_UNDO, -+ CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and -+ * deleted dentires with inodes not -+ * referenced inside dumped process. 
-+ */ -+ CPT_SECT_SYSV_SHM, -+ CPT_SECT_SYSV_SEM, -+ CPT_SECT_ORPHANS, -+ CPT_SECT_NET_DEVICE, -+ CPT_SECT_NET_IFADDR, -+ CPT_SECT_NET_ROUTE, -+ CPT_SECT_NET_IPTABLES, -+ CPT_SECT_NET_CONNTRACK, -+ CPT_SECT_NET_CONNTRACK_VE0, -+ CPT_SECT_UTSNAME, -+ CPT_SECT_TRAILER, -+ CPT_SECT_UBC, -+ CPT_SECT_SLM_SGREGS, -+ CPT_SECT_SLM_REGOBJS, -+/* Due to silly mistake we cannot index sections beyond this value */ -+#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1) -+ CPT_SECT_EPOLL, -+ CPT_SECT_MAX -+}; -+ -+struct cpt_major_tail -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_lazypages; -+ __u32 cpt_64bit; -+ __u64 cpt_sections[CPT_SECT_MAX_INDEX]; -+ __u32 cpt_nsect; -+ __u8 cpt_signature[4]; /* Magic number */ -+} __attribute__ ((aligned (8))); -+ -+ -+/* Common object header. */ -+struct cpt_object_hdr -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+} __attribute__ ((aligned (8))); -+ -+enum _cpt_content_type { -+ CPT_CONTENT_VOID, -+ CPT_CONTENT_ARRAY, -+ CPT_CONTENT_DATA, -+ CPT_CONTENT_NAME, -+ -+ CPT_CONTENT_STACK, -+ CPT_CONTENT_X86_FPUSTATE_OLD, -+ CPT_CONTENT_X86_FPUSTATE, -+ CPT_CONTENT_MM_CONTEXT, -+ CPT_CONTENT_SEMARRAY, -+ CPT_CONTENT_SEMUNDO, -+ CPT_CONTENT_NLMARRAY, -+ CPT_CONTENT_MAX -+}; -+ -+/* CPT_OBJ_BITS: encode array of bytes */ -+struct cpt_obj_bits -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_size; -+ __u32 __cpt_pad1; -+} __attribute__ ((aligned (8))); -+ -+/* CPT_OBJ_REF: a reference to another object */ -+struct cpt_obj_ref -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_pos; -+} __attribute__ ((aligned (8))); -+ -+/* CPT_OBJ_VEINFO: various ve specific data */ -+struct cpt_veinfo_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ /* ipc ctls */ -+ __u32 shm_ctl_max; -+ __u32 shm_ctl_all; -+ 
__u32 shm_ctl_mni; -+ __u32 msg_ctl_max; -+ __u32 msg_ctl_mni; -+ __u32 msg_ctl_mnb; -+ __u32 sem_ctl_arr[4]; -+ -+ /* start time */ -+ __u64 start_timespec_delta; -+ __u64 start_jiffies_delta; -+} __attribute__ ((aligned (8))); -+ -+/* CPT_OBJ_FILE: one struct file */ -+struct cpt_file_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_flags; -+ __u32 cpt_mode; -+ __u64 cpt_pos; -+ __u32 cpt_uid; -+ __u32 cpt_gid; -+ -+ __u32 cpt_i_mode; -+ __u32 cpt_lflags; -+#define CPT_DENTRY_DELETED 1 -+#define CPT_DENTRY_ROOT 2 -+#define CPT_DENTRY_CLONING 4 -+#define CPT_DENTRY_PROC 8 -+#define CPT_DENTRY_EPOLL 0x10 -+ __u64 cpt_inode; -+ __u64 cpt_priv; -+ -+ __u32 cpt_fown_fd; -+ __u32 cpt_fown_pid; -+ __u32 cpt_fown_uid; -+ __u32 cpt_fown_euid; -+ __u32 cpt_fown_signo; -+ __u32 __cpt_pad1; -+} __attribute__ ((aligned (8))); -+/* Followed by file name, encoded as CPT_OBJ_NAME */ -+ -+struct cpt_epoll_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_file; -+} __attribute__ ((aligned (8))); -+/* Followed by array of struct cpt_epoll_file */ -+ -+struct cpt_epoll_file_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_file; -+ __u32 cpt_fd; -+ __u32 cpt_events; -+ __u64 cpt_data; -+ __u32 cpt_revents; -+ __u32 cpt_ready; -+} __attribute__ ((aligned (8))); -+ -+ -+/* CPT_OBJ_FILEDESC: one file descriptor */ -+struct cpt_fd_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_fd; -+ __u32 cpt_flags; -+#define CPT_FD_FLAG_CLOSEEXEC 1 -+ __u64 cpt_file; -+} __attribute__ ((aligned (8))); -+ -+/* CPT_OBJ_FILES: one files_struct */ -+struct cpt_files_struct_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_index; -+ __u32 cpt_max_fds; -+ __u32 cpt_next_fd; -+ __u32 __cpt_pad1; -+} __attribute__ 
((aligned (8))); -+/* Followed by array of cpt_fd_image */ -+ -+/* CPT_OBJ_FS: one fs_struct */ -+struct cpt_fs_struct_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_umask; -+ __u32 __cpt_pad1; -+} __attribute__ ((aligned (8))); -+/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */ -+ -+/* CPT_OBJ_INODE: one struct inode */ -+struct cpt_inode_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_dev; -+ __u64 cpt_ino; -+ __u32 cpt_mode; -+ __u32 cpt_nlink; -+ __u32 cpt_uid; -+ __u32 cpt_gid; -+ __u64 cpt_rdev; -+ __u64 cpt_size; -+ __u64 cpt_blksize; -+ __u64 cpt_atime; -+ __u64 cpt_mtime; -+ __u64 cpt_ctime; -+ __u64 cpt_blocks; -+ __u32 cpt_sb; -+ __u32 __cpt_pad1; -+} __attribute__ ((aligned (8))); -+ -+/* CPT_OBJ_VFSMOUNT: one vfsmount */ -+struct cpt_vfsmount_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_mntflags; -+ __u32 cpt_flags; -+} __attribute__ ((aligned (8))); -+ -+ -+struct cpt_flock_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_owner; -+ __u32 cpt_pid; -+ __u64 cpt_start; -+ __u64 cpt_end; -+ __u32 cpt_flags; -+ __u32 cpt_type; -+} __attribute__ ((aligned (8))); -+ -+ -+struct cpt_tty_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_flags; -+ __u32 cpt_link; -+ __u32 cpt_index; -+ __u32 cpt_drv_type; -+ __u32 cpt_drv_subtype; -+ __u32 cpt_drv_flags; -+ __u8 cpt_packet; -+ __u8 cpt_stopped; -+ __u8 cpt_hw_stopped; -+ __u8 cpt_flow_stopped; -+ -+ __u32 cpt_canon_data; -+ __u32 cpt_canon_head; -+ __u32 cpt_canon_column; -+ __u32 cpt_column; -+ __u8 cpt_ctrl_status; -+ __u8 cpt_erasing; -+ __u8 cpt_lnext; -+ __u8 cpt_icanon; -+ __u8 cpt_raw; -+ __u8 cpt_real_raw; -+ __u8 cpt_closing; -+ __u8 __cpt_pad1; -+ __u16 
cpt_minimum_to_wake; -+ __u16 __cpt_pad2; -+ __u32 cpt_pgrp; -+ __u32 cpt_session; -+ __u32 cpt_c_line; -+ __u8 cpt_name[64]; -+ __u16 cpt_ws_row; -+ __u16 cpt_ws_col; -+ __u16 cpt_ws_prow; -+ __u16 cpt_ws_pcol; -+ __u8 cpt_c_cc[32]; -+ __u32 cpt_c_iflag; -+ __u32 cpt_c_oflag; -+ __u32 cpt_c_cflag; -+ __u32 cpt_c_lflag; -+ __u32 cpt_read_flags[4096/32]; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_sock_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_file; -+ __u32 cpt_parent; -+ __u32 cpt_index; -+ -+ __u64 cpt_ssflags; -+ __u16 cpt_type; -+ __u16 cpt_family; -+ __u8 cpt_sstate; -+ __u8 cpt_passcred; -+ __u8 cpt_state; -+ __u8 cpt_reuse; -+ -+ __u8 cpt_zapped; -+ __u8 cpt_shutdown; -+ __u8 cpt_userlocks; -+ __u8 cpt_no_check; -+ __u8 cpt_debug; -+ __u8 cpt_rcvtstamp; -+ __u8 cpt_localroute; -+ __u8 cpt_protocol; -+ -+ __u32 cpt_err; -+ __u32 cpt_err_soft; -+ -+ __u16 cpt_max_ack_backlog; -+ __u16 __cpt_pad1; -+ __u32 cpt_priority; -+ -+ __u32 cpt_rcvlowat; -+ __u32 cpt_bound_dev_if; -+ -+ __u64 cpt_rcvtimeo; -+ __u64 cpt_sndtimeo; -+ __u32 cpt_rcvbuf; -+ __u32 cpt_sndbuf; -+ __u64 cpt_flags; -+ __u64 cpt_lingertime; -+ __u32 cpt_peer_pid; -+ __u32 cpt_peer_uid; -+ -+ __u32 cpt_peer_gid; -+ __u32 cpt_laddrlen; -+ __u32 cpt_laddr[128/4]; -+ __u32 cpt_raddrlen; -+ __u32 cpt_raddr[128/4]; -+ /* AF_UNIX */ -+ __u32 cpt_peer; -+ -+ __u8 cpt_socketpair; -+ __u8 cpt_deleted; -+ __u16 __cpt_pad4; -+ __u32 __cpt_pad5; -+/* -+ struct sk_filter *sk_filter; -+ */ -+ -+ __u64 cpt_stamp; -+ __u32 cpt_daddr; -+ __u16 cpt_dport; -+ __u16 cpt_sport; -+ -+ __u32 cpt_saddr; -+ __u32 cpt_rcv_saddr; -+ -+ __u32 cpt_uc_ttl; -+ __u32 cpt_tos; -+ -+ __u32 cpt_cmsg_flags; -+ __u32 cpt_mc_index; -+ -+ __u32 cpt_mc_addr; -+/* -+ struct ip_options *opt; -+ struct ip_mc_socklist *mc_list; -+ */ -+ __u8 cpt_hdrincl; -+ __u8 cpt_mc_ttl; -+ __u8 cpt_mc_loop; -+ __u8 cpt_pmtudisc; -+ -+ __u8 cpt_recverr; -+ __u8 cpt_freebind; -+ 
__u16 cpt_idcounter; -+ __u32 cpt_cork_flags; -+ -+ __u32 cpt_cork_fragsize; -+ __u32 cpt_cork_length; -+ __u32 cpt_cork_addr; -+ __u32 cpt_cork_saddr; -+ __u32 cpt_cork_daddr; -+ __u32 cpt_cork_oif; -+ -+ __u32 cpt_udp_pending; -+ __u32 cpt_udp_corkflag; -+ __u16 cpt_udp_encap; -+ __u16 cpt_udp_len; -+ __u32 __cpt_pad7; -+ -+ __u64 cpt_saddr6[2]; -+ __u64 cpt_rcv_saddr6[2]; -+ __u64 cpt_daddr6[2]; -+ __u32 cpt_flow_label6; -+ __u32 cpt_frag_size6; -+ __u32 cpt_hop_limit6; -+ __u32 cpt_mcast_hops6; -+ -+ __u32 cpt_mcast_oif6; -+ __u8 cpt_rxopt6; -+ __u8 cpt_mc_loop6; -+ __u8 cpt_recverr6; -+ __u8 cpt_sndflow6; -+ -+ __u8 cpt_pmtudisc6; -+ __u8 cpt_ipv6only6; -+ __u8 cpt_mapped; -+ __u8 __cpt_pad8; -+ __u32 cpt_pred_flags; -+ -+ __u32 cpt_rcv_nxt; -+ __u32 cpt_snd_nxt; -+ -+ __u32 cpt_snd_una; -+ __u32 cpt_snd_sml; -+ -+ __u32 cpt_rcv_tstamp; -+ __u32 cpt_lsndtime; -+ -+ __u8 cpt_tcp_header_len; -+ __u8 cpt_ack_pending; -+ __u8 cpt_quick; -+ __u8 cpt_pingpong; -+ __u8 cpt_blocked; -+ __u8 __cpt_pad9; -+ __u16 __cpt_pad10; -+ -+ __u32 cpt_ato; -+ __u32 cpt_ack_timeout; -+ -+ __u32 cpt_lrcvtime; -+ __u16 cpt_last_seg_size; -+ __u16 cpt_rcv_mss; -+ -+ __u32 cpt_snd_wl1; -+ __u32 cpt_snd_wnd; -+ -+ __u32 cpt_max_window; -+ __u32 cpt_pmtu_cookie; -+ -+ __u32 cpt_mss_cache; -+ __u16 cpt_mss_cache_std; -+ __u16 cpt_mss_clamp; -+ -+ __u16 cpt_ext_header_len; -+ __u16 cpt_ext2_header_len; -+ __u8 cpt_ca_state; -+ __u8 cpt_retransmits; -+ __u8 cpt_reordering; -+ __u8 cpt_frto_counter; -+ -+ __u32 cpt_frto_highmark; -+ __u8 cpt_adv_cong; -+ __u8 cpt_defer_accept; -+ __u8 cpt_backoff; -+ __u8 __cpt_pad11; -+ -+ __u32 cpt_srtt; -+ __u32 cpt_mdev; -+ -+ __u32 cpt_mdev_max; -+ __u32 cpt_rttvar; -+ -+ __u32 cpt_rtt_seq; -+ __u32 cpt_rto; -+ -+ __u32 cpt_packets_out; -+ __u32 cpt_left_out; -+ -+ __u32 cpt_retrans_out; -+ __u32 cpt_snd_ssthresh; -+ -+ __u32 cpt_snd_cwnd; -+ __u16 cpt_snd_cwnd_cnt; -+ __u16 cpt_snd_cwnd_clamp; -+ -+ __u32 cpt_snd_cwnd_used; -+ __u32 
cpt_snd_cwnd_stamp; -+ -+ __u32 cpt_timeout; -+ __u32 cpt_ka_timeout; -+ -+ __u32 cpt_rcv_wnd; -+ __u32 cpt_rcv_wup; -+ -+ __u32 cpt_write_seq; -+ __u32 cpt_pushed_seq; -+ -+ __u32 cpt_copied_seq; -+ __u8 cpt_tstamp_ok; -+ __u8 cpt_wscale_ok; -+ __u8 cpt_sack_ok; -+ __u8 cpt_saw_tstamp; -+ -+ __u8 cpt_snd_wscale; -+ __u8 cpt_rcv_wscale; -+ __u8 cpt_nonagle; -+ __u8 cpt_keepalive_probes; -+ __u32 cpt_rcv_tsval; -+ -+ __u32 cpt_rcv_tsecr; -+ __u32 cpt_ts_recent; -+ -+ __u64 cpt_ts_recent_stamp; -+ __u16 cpt_user_mss; -+ __u8 cpt_dsack; -+ __u8 cpt_eff_sacks; -+ __u32 cpt_sack_array[2*5]; -+ __u32 cpt_window_clamp; -+ -+ __u32 cpt_rcv_ssthresh; -+ __u8 cpt_probes_out; -+ __u8 cpt_num_sacks; -+ __u16 cpt_advmss; -+ -+ __u8 cpt_syn_retries; -+ __u8 cpt_ecn_flags; -+ __u16 cpt_prior_ssthresh; -+ __u32 cpt_lost_out; -+ -+ __u32 cpt_sacked_out; -+ __u32 cpt_fackets_out; -+ -+ __u32 cpt_high_seq; -+ __u32 cpt_retrans_stamp; -+ -+ __u32 cpt_undo_marker; -+ __u32 cpt_undo_retrans; -+ -+ __u32 cpt_urg_seq; -+ __u16 cpt_urg_data; -+ __u8 cpt_pending; -+ __u8 cpt_urg_mode; -+ -+ __u32 cpt_snd_up; -+ __u32 cpt_keepalive_time; -+ -+ __u32 cpt_keepalive_intvl; -+ __u32 cpt_linger2; -+ -+ __u32 cpt_rcvrtt_rtt; -+ __u32 cpt_rcvrtt_seq; -+ -+ __u32 cpt_rcvrtt_time; -+ __u32 __cpt_pad12; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_openreq_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_rcv_isn; -+ __u32 cpt_snt_isn; -+ -+ __u16 cpt_rmt_port; -+ __u16 cpt_mss; -+ __u8 cpt_family; -+ __u8 cpt_retrans; -+ __u8 cpt_snd_wscale; -+ __u8 cpt_rcv_wscale; -+ -+ __u8 cpt_tstamp_ok; -+ __u8 cpt_sack_ok; -+ __u8 cpt_wscale_ok; -+ __u8 cpt_ecn_ok; -+ __u8 cpt_acked; -+ __u8 __cpt_pad1; -+ __u16 __cpt_pad2; -+ -+ __u32 cpt_window_clamp; -+ __u32 cpt_rcv_wnd; -+ __u32 cpt_ts_recent; -+ __u32 cpt_iif; -+ __u64 cpt_expires; -+ -+ __u64 cpt_loc_addr[2]; -+ __u64 cpt_rmt_addr[2]; -+/* -+ struct ip_options *opt; -+ */ -+ -+} __attribute__ 
((aligned (8))); -+ -+struct cpt_skb_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_owner; -+ __u32 cpt_queue; -+#define CPT_SKB_NQ 0 -+#define CPT_SKB_RQ 1 -+#define CPT_SKB_WQ 2 -+#define CPT_SKB_OFOQ 3 -+ -+ __u64 cpt_stamp; -+ __u32 cpt_len; -+ __u32 cpt_hspace; -+ __u32 cpt_tspace; -+ __u32 cpt_h; -+ __u32 cpt_nh; -+ __u32 cpt_mac; -+ -+ __u64 cpt_cb[5]; -+ __u32 cpt_mac_len; -+ __u32 cpt_csum; -+ __u8 cpt_local_df; -+ __u8 cpt_pkt_type; -+ __u8 cpt_ip_summed; -+ __u8 __cpt_pad1; -+ __u32 cpt_priority; -+ __u16 cpt_protocol; -+ __u16 cpt_security; -+ __u16 cpt_tso_segs; -+ __u16 cpt_tso_size; -+} __attribute__ ((aligned (8))); -+ -+ -+struct cpt_sysvshm_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_key; -+ __u64 cpt_uid; -+ __u64 cpt_gid; -+ __u64 cpt_cuid; -+ __u64 cpt_cgid; -+ __u64 cpt_mode; -+ __u64 cpt_seq; -+ -+ __u32 cpt_id; -+ __u32 cpt_mlockuser; -+ __u64 cpt_segsz; -+ __u64 cpt_atime; -+ __u64 cpt_ctime; -+ __u64 cpt_dtime; -+ __u64 cpt_creator; -+ __u64 cpt_last; -+} __attribute__ ((aligned (8))); -+ -+ -+struct cpt_sysvsem_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_key; -+ __u64 cpt_uid; -+ __u64 cpt_gid; -+ __u64 cpt_cuid; -+ __u64 cpt_cgid; -+ __u64 cpt_mode; -+ __u64 cpt_seq; -+ __u32 cpt_id; -+ __u32 __cpt_pad1; -+ -+ __u64 cpt_otime; -+ __u64 cpt_ctime; -+} __attribute__ ((aligned (8))); -+/* Content is array of pairs semval/sempid */ -+ -+struct cpt_sysvsem_undo_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_id; -+ __u32 cpt_nsem; -+} __attribute__ ((aligned (8))); -+ -+ -+struct cpt_mm_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_start_code; -+ __u64 cpt_end_code; -+ __u64 cpt_start_data; -+ __u64 cpt_end_data; -+ __u64 
cpt_start_brk; -+ __u64 cpt_brk; -+ __u64 cpt_start_stack; -+ __u64 cpt_start_arg; -+ __u64 cpt_end_arg; -+ __u64 cpt_start_env; -+ __u64 cpt_end_env; -+ __u64 cpt_def_flags; -+ __u64 cpt_mmub; -+ __u8 cpt_dumpable; -+ __u8 cpt_vps_dumpable; -+ __u8 cpt_used_hugetlb; -+ __u8 __cpt_pad; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_page_block -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_start; -+ __u64 cpt_end; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_remappage_block -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_start; -+ __u64 cpt_end; -+ __u64 cpt_pgoff; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_copypage_block -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_start; -+ __u64 cpt_end; -+ __u64 cpt_source; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_lazypage_block -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_start; -+ __u64 cpt_end; -+ __u64 cpt_index; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_vma_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_file; -+ __u32 cpt_type; -+#define CPT_VMA_TYPE_0 0 -+#define CPT_VMA_TYPE_SHM 1 -+ __u32 cpt_anonvma; -+ __u64 cpt_anonvmaid; -+ -+ __u64 cpt_start; -+ __u64 cpt_end; -+ __u64 cpt_flags; -+ __u64 cpt_pgprot; -+ __u64 cpt_pgoff; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_aio_ctx_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_max_reqs; -+ __u32 cpt_ring_pages; -+ __u32 cpt_tail; -+ __u32 cpt_nr; -+ __u64 cpt_mmap_base; -+ /* Data (io_event's) and struct aio_ring are stored in user space VM */ -+} __attribute__ ((aligned (8))); -+ -+ -+/* Format of MM section. -+ * -+ * It is array of MM objects (mm_struct). 
Each MM object is -+ * header, encoding mm_struct, followed by array of VMA objects. -+ * Each VMA consists of VMA header, encoding vm_area_struct, and -+ * if the VMA contains copied pages, the header is followed by -+ * array of tuples start-end each followed by data. -+ * -+ * ATTN: no block/page alignment. Only 64bit alignment. This might be not good? -+ */ -+ -+struct cpt_restart_block { -+ __u64 fn; -+#define CPT_RBL_0 0 -+#define CPT_RBL_NANOSLEEP 1 -+#define CPT_RBL_COMPAT_NANOSLEEP 2 -+ __u64 arg0; -+ __u64 arg1; -+ __u64 arg2; -+ __u64 arg3; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_siginfo_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_qflags; -+ __u32 cpt_signo; -+ __u32 cpt_errno; -+ __u32 cpt_code; -+ -+ __u64 cpt_sigval; -+ __u32 cpt_pid; -+ __u32 cpt_uid; -+ __u64 cpt_utime; -+ __u64 cpt_stime; -+ -+ __u64 cpt_user; -+} __attribute__ ((aligned (8))); -+ -+/* Portable presentaions for segment registers */ -+ -+#define CPT_SEG_ZERO 0 -+#define CPT_SEG_TLS1 1 -+#define CPT_SEG_TLS2 2 -+#define CPT_SEG_TLS3 3 -+#define CPT_SEG_USER32_DS 4 -+#define CPT_SEG_USER32_CS 5 -+#define CPT_SEG_USER64_DS 6 -+#define CPT_SEG_USER64_CS 7 -+#define CPT_SEG_LDT 256 -+ -+struct cpt_x86_regs -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_debugreg[8]; -+ __u32 cpt_fs; -+ __u32 cpt_gs; -+ -+ __u32 cpt_ebx; -+ __u32 cpt_ecx; -+ __u32 cpt_edx; -+ __u32 cpt_esi; -+ __u32 cpt_edi; -+ __u32 cpt_ebp; -+ __u32 cpt_eax; -+ __u32 cpt_xds; -+ __u32 cpt_xes; -+ __u32 cpt_orig_eax; -+ __u32 cpt_eip; -+ __u32 cpt_xcs; -+ __u32 cpt_eflags; -+ __u32 cpt_esp; -+ __u32 cpt_xss; -+ __u32 cpt_pad; -+}; -+ -+struct cpt_x86_64_regs -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_debugreg[8]; -+ -+ __u64 cpt_fsbase; -+ __u64 cpt_gsbase; -+ __u32 cpt_fsindex; -+ __u32 cpt_gsindex; -+ __u32 cpt_ds; -+ __u32 cpt_es; 
-+ -+ __u64 cpt_r15; -+ __u64 cpt_r14; -+ __u64 cpt_r13; -+ __u64 cpt_r12; -+ __u64 cpt_rbp; -+ __u64 cpt_rbx; -+ __u64 cpt_r11; -+ __u64 cpt_r10; -+ __u64 cpt_r9; -+ __u64 cpt_r8; -+ __u64 cpt_rax; -+ __u64 cpt_rcx; -+ __u64 cpt_rdx; -+ __u64 cpt_rsi; -+ __u64 cpt_rdi; -+ __u64 cpt_orig_rax; -+ __u64 cpt_rip; -+ __u64 cpt_cs; -+ __u64 cpt_eflags; -+ __u64 cpt_rsp; -+ __u64 cpt_ss; -+}; -+ -+struct cpt_task_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_state; -+ __u64 cpt_flags; -+ __u64 cpt_ptrace; -+ __u32 cpt_prio; -+ __u32 cpt_static_prio; -+ __u32 cpt_policy; -+ __u32 cpt_rt_priority; -+ -+ /* struct thread_info */ -+ __u64 cpt_exec_domain; -+ __u64 cpt_thrflags; -+ __u64 cpt_thrstatus; -+ __u64 cpt_addr_limit; -+ -+ __u64 cpt_personality; -+ -+ __u64 cpt_mm; -+ __u64 cpt_files; -+ __u64 cpt_fs; -+ __u64 cpt_signal; -+ __u64 cpt_sighand; -+ __u64 cpt_sigblocked; -+ __u64 cpt_sigrblocked; -+ __u64 cpt_sigpending; -+ __u64 cpt_namespace; -+ __u64 cpt_sysvsem_undo; -+ __u32 cpt_pid; -+ __u32 cpt_tgid; -+ __u32 cpt_ppid; -+ __u32 cpt_rppid; -+ __u32 cpt_pgrp; -+ __u32 cpt_session; -+ __u32 cpt_old_pgrp; -+ __u32 __cpt_pad; -+ __u32 cpt_leader; -+ __u8 cpt_pn_state; -+ __u8 cpt_stopped_state; -+ __u8 cpt_sigsuspend_state; -+ __u8 cpt_64bit; -+ __u64 cpt_set_tid; -+ __u64 cpt_clear_tid; -+ __u32 cpt_exit_code; -+ __u32 cpt_exit_signal; -+ __u32 cpt_pdeath_signal; -+ __u32 cpt_user; -+ __u32 cpt_uid; -+ __u32 cpt_euid; -+ __u32 cpt_suid; -+ __u32 cpt_fsuid; -+ __u32 cpt_gid; -+ __u32 cpt_egid; -+ __u32 cpt_sgid; -+ __u32 cpt_fsgid; -+ __u32 cpt_ngids; -+ __u32 cpt_gids[32]; -+ __u32 __cpt_pad2; -+ __u64 cpt_ecap; -+ __u64 cpt_icap; -+ __u64 cpt_pcap; -+ __u8 cpt_comm[16]; -+ __u64 cpt_tls[3]; -+ struct cpt_restart_block cpt_restart; -+ __u64 cpt_it_real_value; /* V0: jiffies, V1: nsec */ -+ __u64 cpt_it_real_incr; /* V0: jiffies, V1: nsec */ -+ __u64 cpt_it_prof_value; -+ __u64 cpt_it_prof_incr; -+ __u64 
cpt_it_virt_value; -+ __u64 cpt_it_virt_incr; -+ -+ __u16 cpt_used_math; -+ __u8 cpt_keepcap; -+ __u8 cpt_did_exec; -+ __u32 cpt_ptrace_message; -+ -+ __u64 cpt_utime; -+ __u64 cpt_stime; -+ __u64 cpt_starttime; /* V0: jiffies, V1: timespec */ -+ __u64 cpt_nvcsw; -+ __u64 cpt_nivcsw; -+ __u64 cpt_min_flt; -+ __u64 cpt_maj_flt; -+ -+ __u64 cpt_sigsuspend_blocked; -+ __u64 cpt_cutime, cpt_cstime; -+ __u64 cpt_cnvcsw, cpt_cnivcsw; -+ __u64 cpt_cmin_flt, cpt_cmaj_flt; -+ -+#define CPT_RLIM_NLIMITS 16 -+ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; -+ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; -+ -+ __u64 cpt_task_ub; -+ __u64 cpt_exec_ub; -+ __u64 cpt_mm_ub; -+ __u64 cpt_fork_sub; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_signal_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_leader; -+ __u8 cpt_pgrp_type; -+ __u8 cpt_old_pgrp_type; -+ __u8 cpt_session_type; -+#define CPT_PGRP_NORMAL 0 -+#define CPT_PGRP_ORPHAN 1 -+#define CPT_PGRP_STRAY 2 -+ __u8 __cpt_pad1; -+ __u64 cpt_pgrp; -+ __u64 cpt_old_pgrp; -+ __u64 cpt_session; -+ __u64 cpt_sigpending; -+ __u64 cpt_ctty; -+ -+ __u32 cpt_curr_target; -+ __u32 cpt_group_exit; -+ __u32 cpt_group_exit_code; -+ __u32 cpt_group_exit_task; -+ __u32 cpt_notify_count; -+ __u32 cpt_group_stop_count; -+ __u32 cpt_stop_state; -+ __u32 __cpt_pad2; -+ -+ __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime; -+ __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw; -+ __u64 cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt; -+ -+ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; -+ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; -+} __attribute__ ((aligned (8))); -+/* Followed by list of posix timers. */ -+ -+struct cpt_sighand_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+} __attribute__ ((aligned (8))); -+/* Followed by list of sighandles. 
*/ -+ -+struct cpt_sighandler_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_signo; -+ __u32 __cpt_pad1; -+ __u64 cpt_handler; -+ __u64 cpt_restorer; -+ __u64 cpt_flags; -+ __u64 cpt_mask; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_netdev_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_index; -+ __u32 cpt_flags; -+ __u8 cpt_name[16]; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_ifaddr_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u32 cpt_index; -+ __u8 cpt_family; -+ __u8 cpt_masklen; -+ __u8 cpt_flags; -+ __u8 cpt_scope; -+ __u32 cpt_address[4]; -+ __u32 cpt_peer[4]; -+ __u32 cpt_broadcast[4]; -+ __u8 cpt_label[16]; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_ipct_tuple -+{ -+ __u32 cpt_src; -+ __u16 cpt_srcport; -+ __u16 __cpt_pad1; -+ -+ __u32 cpt_dst; -+ __u16 cpt_dstport; -+ __u8 cpt_protonum; -+ __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */ -+} __attribute__ ((aligned (8))); -+ -+struct cpt_nat_manip -+{ -+ __u8 cpt_direction; -+ __u8 cpt_hooknum; -+ __u8 cpt_maniptype; -+ __u8 __cpt_pad1; -+ -+ __u32 cpt_manip_addr; -+ __u16 cpt_manip_port; -+ __u16 __cpt_pad2; -+ __u32 __cpt_pad3; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_nat_seq -+{ -+ __u32 cpt_correction_pos; -+ __u32 cpt_offset_before; -+ __u32 cpt_offset_after; -+ __u32 __cpt_pad1; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_ip_connexpect_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_timeout; -+ __u32 cpt_sibling_conntrack; /* Index of child conntrack */ -+ __u32 cpt_seq; /* id in 2.6.15 */ -+ -+ struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */ -+ struct cpt_ipct_tuple cpt_tuple; -+ struct cpt_ipct_tuple cpt_mask; -+ -+ /* union ip_conntrack_expect_help. 
Used by ftp, irc, amanda */ -+ __u32 cpt_help[3]; /* NU 2.6.15 */ -+ __u16 cpt_manip_proto; -+ __u8 cpt_dir; -+ __u8 cpt_flags; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_ip_conntrack_image -+{ -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ struct cpt_ipct_tuple cpt_tuple[2]; -+ __u64 cpt_status; -+ __u64 cpt_timeout; -+ __u32 cpt_index; -+ __u8 cpt_ct_helper; -+ __u8 cpt_nat_helper; -+ __u16 cpt_pad1; -+ -+ /* union ip_conntrack_proto. Used by tcp and icmp. */ -+ __u32 cpt_proto_data[12]; -+ -+ /* union ip_conntrack_help. Used by ftp and pptp helper. -+ * We do not support pptp... -+ */ -+ __u32 cpt_help_data[6]; -+ -+ /* nat info */ -+ __u32 cpt_initialized; /* NU 2.6.15 */ -+ __u32 cpt_num_manips; /* NU 2.6.15 */ -+ struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */ -+ -+ struct cpt_nat_seq cpt_nat_seq[2]; -+ -+ __u32 cpt_masq_index; -+ __u32 cpt_id; -+ __u32 cpt_mark; -+} __attribute__ ((aligned (8))); -+ -+struct cpt_beancounter_image { -+ __u64 cpt_next; -+ __u32 cpt_object; -+ __u16 cpt_hdrlen; -+ __u16 cpt_content; -+ -+ __u64 cpt_parent; -+ __u32 cpt_id; -+ __u32 __cpt_pad; -+ __u64 cpt_parms[32 * 6 * 2]; -+} __attribute__ ((aligned (8))); -+ -+#ifdef __KERNEL__ -+ -+static inline void *cpt_ptr_import(__u64 ptr) -+{ -+ return (void*)(unsigned long)ptr; -+} -+ -+static inline __u64 cpt_ptr_export(void __user *ptr) -+{ -+ return (__u64)(unsigned long)ptr; -+} -+ -+static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr) -+{ -+ memcpy(sig, &ptr, sizeof(*sig)); -+} -+ -+static inline __u64 cpt_sigset_export(sigset_t *sig) -+{ -+ return *(__u64*)sig; -+} -+ -+static inline __u64 cpt_timespec_export(struct timespec *tv) -+{ -+ return (((u64)tv->tv_sec) << 32) + tv->tv_nsec; -+} -+ -+static inline void cpt_timespec_import(struct timespec *tv, __u64 val) -+{ -+ tv->tv_sec = val>>32; -+ tv->tv_nsec = (val&0xFFFFFFFF); -+} -+ -+static inline __u64 cpt_timeval_export(struct timeval *tv) -+{ -+ return 
(((u64)tv->tv_sec) << 32) + tv->tv_usec; -+} -+ -+static inline void cpt_timeval_import(struct timeval *tv, __u64 val) -+{ -+ tv->tv_sec = val>>32; -+ tv->tv_usec = (val&0xFFFFFFFF); -+} -+ -+#endif -+ -+#endif /* __CPT_IMAGE_H_ */ -diff -upr linux-2.6.16.orig/include/linux/cpt_ioctl.h linux-2.6.16-026test009/include/linux/cpt_ioctl.h ---- linux-2.6.16.orig/include/linux/cpt_ioctl.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/cpt_ioctl.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,41 @@ -+/* -+ * -+ * include/linux/cpt_ioctl.h -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef _CPT_IOCTL_H_ -+#define _CPT_IOCTL_H_ 1 -+ -+#include <linux/types.h> -+#include <linux/ioctl.h> -+ -+#define CPTCTLTYPE '-' -+#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int) -+#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int) -+#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int) -+#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int) -+#define CPT_SUSPEND _IO(CPTCTLTYPE, 5) -+#define CPT_DUMP _IO(CPTCTLTYPE, 6) -+#define CPT_UNDUMP _IO(CPTCTLTYPE, 7) -+#define CPT_RESUME _IO(CPTCTLTYPE, 8) -+#define CPT_KILL _IO(CPTCTLTYPE, 9) -+#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10) -+#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int) -+#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12) -+#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int) -+#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int) -+#define CPT_PAGEIND _IO(CPTCTLTYPE, 15) -+#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int) -+#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int) -+#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int) -+#define CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int) -+#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int) -+#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int) -+ -+#endif -diff -upr linux-2.6.16.orig/include/linux/cpu.h linux-2.6.16-026test009/include/linux/cpu.h ---- 
linux-2.6.16.orig/include/linux/cpu.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/cpu.h 2006-04-19 15:02:11.000000000 +0400 -@@ -32,7 +32,7 @@ struct cpu { - }; - - extern int register_cpu(struct cpu *, int, struct node *); --extern struct sys_device *get_cpu_sysdev(int cpu); -+extern struct sys_device *get_cpu_sysdev(unsigned cpu); - #ifdef CONFIG_HOTPLUG_CPU - extern void unregister_cpu(struct cpu *, struct node *); - #endif -diff -upr linux-2.6.16.orig/include/linux/dcache.h linux-2.6.16-026test009/include/linux/dcache.h ---- linux-2.6.16.orig/include/linux/dcache.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/dcache.h 2006-04-19 15:02:12.000000000 +0400 -@@ -9,6 +9,8 @@ - #include <linux/cache.h> - #include <linux/rcupdate.h> - -+#include <ub/ub_dcache.h> -+ - struct nameidata; - struct vfsmount; - -@@ -111,6 +113,9 @@ struct dentry { - struct dcookie_struct *d_cookie; /* cookie, if any */ - #endif - int d_mounted; -+#ifdef CONFIG_USER_RESOURCE -+ struct dentry_beancounter dentry_bc; -+#endif - unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ - }; - -@@ -161,7 +166,11 @@ d_iput: no no no yes - - #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ - #define DCACHE_UNHASHED 0x0010 -+#define DCACHE_VIRTUAL 0x0100 /* ve accessible */ -+ -+extern void mark_tree_virtual(struct vfsmount *m, struct dentry *d); - -+extern kmem_cache_t *dentry_cache; - extern spinlock_t dcache_lock; - - /** -@@ -215,7 +224,8 @@ extern struct dentry * d_alloc_anon(stru - extern struct dentry * d_splice_alias(struct inode *, struct dentry *); - extern void shrink_dcache_sb(struct super_block *); - extern void shrink_dcache_parent(struct dentry *); --extern void shrink_dcache_anon(struct hlist_head *); -+extern void shrink_dcache_anon(struct super_block *); -+extern void dcache_shrinker_wait_sb(struct super_block *sb); - extern int d_invalidate(struct dentry *); - - /* only used at mount-time */ -@@ -277,6 +287,7 @@ extern struct dentry * __d_lookup(struct - /* validate "insecure" dentry pointer */ - extern int d_validate(struct dentry *, struct dentry *); - -+extern int d_root_check(struct dentry *, struct vfsmount *); - extern char * d_path(struct dentry *, struct vfsmount *, char *, int); - - /* Allocation counts.. 
*/ -@@ -297,6 +308,8 @@ extern char * d_path(struct dentry *, st - static inline struct dentry *dget(struct dentry *dentry) - { - if (dentry) { -+ if (ub_dget_testone(dentry)) -+ BUG(); - BUG_ON(!atomic_read(&dentry->d_count)); - atomic_inc(&dentry->d_count); - } -@@ -340,6 +353,8 @@ extern struct dentry *lookup_create(stru - - extern int sysctl_vfs_cache_pressure; - -+extern int check_area_access_ve(struct dentry *, struct vfsmount *); -+extern int check_area_execute_ve(struct dentry *, struct vfsmount *); - #endif /* __KERNEL__ */ - - #endif /* __LINUX_DCACHE_H */ -diff -upr linux-2.6.16.orig/include/linux/devpts_fs.h linux-2.6.16-026test009/include/linux/devpts_fs.h ---- linux-2.6.16.orig/include/linux/devpts_fs.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/devpts_fs.h 2006-04-19 15:02:12.000000000 +0400 -@@ -21,6 +21,15 @@ int devpts_pty_new(struct tty_struct *tt - struct tty_struct *devpts_get_tty(int number); /* get tty structure */ - void devpts_pty_kill(int number); /* unlink */ - -+struct devpts_config { -+ int setuid; -+ int setgid; -+ uid_t uid; -+ gid_t gid; -+ umode_t mode; -+}; -+ -+extern struct devpts_config devpts_config; - #else - - /* Dummy stubs in the no-pty case */ -diff -upr linux-2.6.16.orig/include/linux/elfcore.h linux-2.6.16-026test009/include/linux/elfcore.h ---- linux-2.6.16.orig/include/linux/elfcore.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/elfcore.h 2006-04-19 15:02:12.000000000 +0400 -@@ -7,6 +7,8 @@ - #include <linux/user.h> - #include <linux/ptrace.h> - -+extern int sysctl_at_vsyscall; -+ - struct elf_siginfo - { - int si_signo; /* signal number */ -diff -upr linux-2.6.16.orig/include/linux/eventpoll.h linux-2.6.16-026test009/include/linux/eventpoll.h ---- linux-2.6.16.orig/include/linux/eventpoll.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/eventpoll.h 2006-04-19 15:02:12.000000000 +0400 -@@ -85,6 +85,91 @@ static inline 
void eventpoll_release(str - eventpoll_release_file(file); - } - -+struct epoll_filefd { -+ struct file *file; -+ int fd; -+}; -+ -+/* -+ * This structure is stored inside the "private_data" member of the file -+ * structure and rapresent the main data sructure for the eventpoll -+ * interface. -+ */ -+struct eventpoll { -+ /* Protect the this structure access */ -+ rwlock_t lock; -+ -+ /* -+ * This semaphore is used to ensure that files are not removed -+ * while epoll is using them. This is read-held during the event -+ * collection loop and it is write-held during the file cleanup -+ * path, the epoll file exit code and the ctl operations. -+ */ -+ struct rw_semaphore sem; -+ -+ /* Wait queue used by sys_epoll_wait() */ -+ wait_queue_head_t wq; -+ -+ /* Wait queue used by file->poll() */ -+ wait_queue_head_t poll_wait; -+ -+ /* List of ready file descriptors */ -+ struct list_head rdllist; -+ -+ /* RB-Tree root used to store monitored fd structs */ -+ struct rb_root rbr; -+}; -+ -+/* -+ * Each file descriptor added to the eventpoll interface will -+ * have an entry of this type linked to the hash. -+ */ -+struct epitem { -+ /* RB-Tree node used to link this structure to the eventpoll rb-tree */ -+ struct rb_node rbn; -+ -+ /* List header used to link this structure to the eventpoll ready list */ -+ struct list_head rdllink; -+ -+ /* The file descriptor information this item refers to */ -+ struct epoll_filefd ffd; -+ -+ /* Number of active wait queue attached to poll operations */ -+ int nwait; -+ -+ /* List containing poll wait queues */ -+ struct list_head pwqlist; -+ -+ /* The "container" of this item */ -+ struct eventpoll *ep; -+ -+ /* The structure that describe the interested events and the source fd */ -+ struct epoll_event event; -+ -+ /* -+ * Used to keep track of the usage count of the structure. This avoids -+ * that the structure will desappear from underneath our processing. 
-+ */ -+ atomic_t usecnt; -+ -+ /* List header used to link this item to the "struct file" items list */ -+ struct list_head fllink; -+ -+ /* List header used to link the item to the transfer list */ -+ struct list_head txlink; -+ -+ /* -+ * This is used during the collection/transfer of events to userspace -+ * to pin items empty events set. -+ */ -+ unsigned int revents; -+}; -+ -+extern struct semaphore epsem; -+struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); -+int ep_insert(struct eventpoll *ep, struct epoll_event *event, -+ struct file *tfile, int fd); -+void ep_release_epitem(struct epitem *epi); - - #else - -diff -upr linux-2.6.16.orig/include/linux/faudit.h linux-2.6.16-026test009/include/linux/faudit.h ---- linux-2.6.16.orig/include/linux/faudit.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/faudit.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,38 @@ -+/* -+ * include/linux/faudit.h -+ * -+ * Copyright (C) 2005 SWSoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __FAUDIT_H_ -+#define __FAUDIT_H_ -+ -+#include <linux/config.h> -+#include <linux/virtinfo.h> -+ -+struct vfsmount; -+struct dentry; -+struct pt_regs; -+ -+struct faudit_regs_arg { -+ int err; -+ struct pt_regs *regs; -+}; -+ -+struct faudit_stat_arg { -+ int err; -+ struct vfsmount *mnt; -+ struct dentry *dentry; -+ void *stat; -+}; -+ -+#define VIRTINFO_FAUDIT (0) -+#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0) -+#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1) -+#define VIRTINFO_FAUDIT_STATFS64 (VIRTINFO_FAUDIT + 2) -+ -+#endif -diff -upr linux-2.6.16.orig/include/linux/fb.h linux-2.6.16-026test009/include/linux/fb.h ---- linux-2.6.16.orig/include/linux/fb.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/fb.h 2006-04-19 15:02:11.000000000 +0400 -@@ -839,12 +839,10 @@ struct fb_info { - #define FB_LEFT_POS(bpp) (32 - bpp) - #define FB_SHIFT_HIGH(val, bits) ((val) >> (bits)) - #define FB_SHIFT_LOW(val, bits) ((val) << (bits)) --#define FB_BIT_NR(b) (7 - (b)) - #else - #define FB_LEFT_POS(bpp) (0) - #define FB_SHIFT_HIGH(val, bits) ((val) << (bits)) - #define FB_SHIFT_LOW(val, bits) ((val) >> (bits)) --#define FB_BIT_NR(b) (b) - #endif - - /* -diff -upr linux-2.6.16.orig/include/linux/fs.h linux-2.6.16-026test009/include/linux/fs.h ---- linux-2.6.16.orig/include/linux/fs.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/fs.h 2006-04-19 15:02:12.000000000 +0400 -@@ -7,6 +7,7 @@ - */ - - #include <linux/config.h> -+#include <linux/ve_owner.h> - #include <linux/limits.h> - #include <linux/ioctl.h> - -@@ -64,6 +65,7 @@ extern int dir_notify_enable; - #define FMODE_LSEEK 4 - #define FMODE_PREAD 8 - #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ -+#define FMODE_QUOTACTL 4 - - #define RW_MASK 1 - #define RWA_MASK 2 -@@ -83,6 +85,7 @@ extern int dir_notify_enable; - /* public flags for file_system_type */ - #define FS_REQUIRES_DEV 1 - #define FS_BINARY_MOUNTDATA 2 
-+#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */ - #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ - #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon - * as nfs_rename() will be cleaned up -@@ -297,6 +300,9 @@ struct iattr { - * Includes for diskquotas. - */ - #include <linux/quota.h> -+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) -+#include <linux/vzquota_qlnk.h> -+#endif - - /** - * enum positive_aop_returns - aop return codes with specific semantics -@@ -493,6 +499,9 @@ struct inode { - #ifdef CONFIG_QUOTA - struct dquot *i_dquot[MAXQUOTAS]; - #endif -+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) -+ struct vz_quota_ilink i_qlnk; -+#endif - /* These three should probably be a union */ - struct list_head i_devices; - struct pipe_inode_info *i_pipe; -@@ -527,6 +536,8 @@ struct inode { - #endif - }; - -+extern kmem_cache_t *inode_cachep; -+ - /* - * NOTE: in a 32bit arch with a preemptable kernel and - * an UP compile the i_size_read/write must be atomic -@@ -588,6 +599,20 @@ static inline unsigned imajor(struct ino - - extern struct block_device *I_BDEV(struct inode *inode); - -+struct exec_perm { -+ umode_t mode; -+ uid_t uid, gid; -+ int set; -+}; -+ -+static inline void set_exec_perm(struct exec_perm *perm, struct inode *ino) -+{ -+ perm->set = 1; -+ perm->mode = ino->i_mode; -+ perm->uid = ino->i_uid; -+ perm->gid = ino->i_gid; -+} -+ - struct fown_struct { - rwlock_t lock; /* protects pid, uid, euid fields */ - int pid; /* pid or -pgrp where SIGIO should be sent */ -@@ -646,7 +671,10 @@ struct file { - spinlock_t f_ep_lock; - #endif /* #ifdef CONFIG_EPOLL */ - struct address_space *f_mapping; -+ struct ve_struct *owner_env; - }; -+DCL_VE_OWNER_PROTO(FILP, struct file, owner_env) -+ - extern spinlock_t files_lock; - #define file_list_lock() spin_lock(&files_lock); - #define file_list_unlock() spin_unlock(&files_lock); -@@ -710,6 +738,9 @@ struct file_lock { - 
struct file *fl_file; - unsigned char fl_flags; - unsigned char fl_type; -+#ifdef CONFIG_USER_RESOURCE -+ unsigned char fl_charged; -+#endif - loff_t fl_start; - loff_t fl_end; - -@@ -831,6 +862,7 @@ struct super_block { - struct list_head s_io; /* parked for writeback */ - struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ - struct list_head s_files; -+ struct list_head s_dshrinkers; /* active dcache shrinkers */ - - struct block_device *s_bdev; - struct list_head s_instances; -@@ -902,7 +934,7 @@ static inline void unlock_super(struct s - /* - * VFS helper functions.. - */ --extern int vfs_permission(struct nameidata *, int); -+extern int vfs_permission(struct nameidata *, int, struct exec_perm *); - extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); - extern int vfs_mkdir(struct inode *, struct dentry *, int); - extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); -@@ -1041,7 +1073,8 @@ struct inode_operations { - void * (*follow_link) (struct dentry *, struct nameidata *); - void (*put_link) (struct dentry *, struct nameidata *, void *); - void (*truncate) (struct inode *); -- int (*permission) (struct inode *, int, struct nameidata *); -+ int (*permission) (struct inode *, int, struct nameidata *, -+ struct exec_perm *); - int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); - int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); -@@ -1089,6 +1122,8 @@ struct super_operations { - - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); -+ -+ struct inode *(*get_quota_root)(struct super_block *); - }; - - /* Inode state bits. Protected by inode_lock. 
*/ -@@ -1246,8 +1281,14 @@ struct file_system_type { - struct module *owner; - struct file_system_type * next; - struct list_head fs_supers; -+ struct ve_struct *owner_env; - }; - -+DCL_VE_OWNER_PROTO(FSTYPE, struct file_system_type, owner_env) -+ -+void get_filesystem(struct file_system_type *fs); -+void put_filesystem(struct file_system_type *fs); -+ - struct super_block *get_sb_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); -@@ -1285,6 +1326,7 @@ extern struct vfsmount *kern_mount(struc - extern int may_umount_tree(struct vfsmount *); - extern int may_umount(struct vfsmount *); - extern void umount_tree(struct vfsmount *, int, struct list_head *); -+#define kern_umount mntput - extern void release_mounts(struct list_head *); - extern long do_mount(char *, char *, char *, unsigned long, void *); - extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); -@@ -1401,7 +1443,7 @@ extern void release_chrdev_list(void *); - #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ - extern const char *__bdevname(dev_t, char *buffer); - extern const char *bdevname(struct block_device *bdev, char *buffer); --extern struct block_device *lookup_bdev(const char *); -+extern struct block_device *lookup_bdev(const char *, int mode); - extern struct block_device *open_bdev_excl(const char *, int, void *); - extern void close_bdev_excl(struct block_device *); - extern void *acquire_blkdev_list(void); -@@ -1433,7 +1475,7 @@ extern int fs_may_remount_ro(struct supe - #define bio_data_dir(bio) ((bio)->bi_rw & 1) - - extern int check_disk_change(struct block_device *); --extern int invalidate_inodes(struct super_block *); -+extern int invalidate_inodes(struct super_block *, int); - extern int __invalidate_device(struct block_device *); - extern int invalidate_partition(struct gendisk *, int); - unsigned long invalidate_mapping_pages(struct address_space 
*mapping, -@@ -1463,9 +1505,10 @@ extern int do_remount_sb(struct super_bl - void *data, int force); - extern sector_t bmap(struct inode *, sector_t); - extern int notify_change(struct dentry *, struct iattr *); --extern int permission(struct inode *, int, struct nameidata *); -+extern int permission(struct inode *, int, struct nameidata *, -+ struct exec_perm *); - extern int generic_permission(struct inode *, int, -- int (*check_acl)(struct inode *, int)); -+ int (*check_acl)(struct inode *, int), struct exec_perm *); - - extern int get_write_access(struct inode *); - extern int deny_write_access(struct file *); -@@ -1484,7 +1527,9 @@ extern int open_namei(int dfd, const cha - extern int may_open(struct nameidata *, int, int); - - extern int kernel_read(struct file *, unsigned long, char *, unsigned long); --extern struct file * open_exec(const char *); -+ -+struct linux_binprm; -+extern struct file * open_exec(const char *, struct linux_binprm *); - - /* fs/dcache.c -- generic fs support functions */ - extern int is_subdir(struct dentry *, struct dentry *); -diff -upr linux-2.6.16.orig/include/linux/genhd.h linux-2.6.16-026test009/include/linux/genhd.h ---- linux-2.6.16.orig/include/linux/genhd.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/genhd.h 2006-04-19 15:02:12.000000000 +0400 -@@ -421,6 +421,7 @@ static inline struct block_device *bdget - return bdget(MKDEV(disk->major, disk->first_minor) + index); - } - -+extern struct subsystem block_subsys; - #endif - - #endif -diff -upr linux-2.6.16.orig/include/linux/gfp.h linux-2.6.16-026test009/include/linux/gfp.h ---- linux-2.6.16.orig/include/linux/gfp.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/gfp.h 2006-04-19 15:02:11.000000000 +0400 -@@ -47,6 +47,8 @@ struct vm_area_struct; - #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ - #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves 
*/ - #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ -+#define __GFP_UBC ((__force gfp_t)0x40000u)/* charge kmem in buddy and slab */ -+#define __GFP_SOFT_UBC ((__force gfp_t)0x80000u)/* use soft charging */ - - #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ - #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) -@@ -55,14 +57,17 @@ struct vm_area_struct; - #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ - __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ - __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ -- __GFP_NOMEMALLOC|__GFP_HARDWALL) -+ __GFP_NOMEMALLOC|__GFP_HARDWALL| \ -+ __GFP_UBC|__GFP_SOFT_UBC) - - /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ - #define GFP_ATOMIC (__GFP_HIGH) - #define GFP_NOIO (__GFP_WAIT) - #define GFP_NOFS (__GFP_WAIT | __GFP_IO) - #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) -+#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC) - #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) -+#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC) - #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ - __GFP_HIGHMEM) - -diff -upr linux-2.6.16.orig/include/linux/hrtimer.h linux-2.6.16-026test009/include/linux/hrtimer.h ---- linux-2.6.16.orig/include/linux/hrtimer.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/hrtimer.h 2006-04-19 15:02:12.000000000 +0400 -@@ -140,4 +140,9 @@ extern void hrtimer_run_queues(void); - /* Bootup initialization: */ - extern void __init hrtimers_init(void); - -+extern long nanosleep_restart(struct restart_block *restart); -+ -+extern ktime_t schedule_hrtimer(struct hrtimer *timer, -+ const enum hrtimer_mode mode); -+ - #endif -diff -upr linux-2.6.16.orig/include/linux/inetdevice.h linux-2.6.16-026test009/include/linux/inetdevice.h ---- 
linux-2.6.16.orig/include/linux/inetdevice.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/inetdevice.h 2006-04-19 15:02:12.000000000 +0400 -@@ -34,6 +34,12 @@ struct ipv4_devconf - }; - - extern struct ipv4_devconf ipv4_devconf; -+extern struct ipv4_devconf ipv4_devconf_dflt; -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ve_ipv4_devconf (*(get_exec_env()->_ipv4_devconf)) -+#else -+#define ve_ipv4_devconf ipv4_devconf -+#endif - - struct in_device - { -@@ -60,29 +66,29 @@ struct in_device - }; - - #define IN_DEV_FORWARD(in_dev) ((in_dev)->cnf.forwarding) --#define IN_DEV_MFORWARD(in_dev) (ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding) --#define IN_DEV_RPFILTER(in_dev) (ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter) --#define IN_DEV_SOURCE_ROUTE(in_dev) (ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route) --#define IN_DEV_BOOTP_RELAY(in_dev) (ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay) -- --#define IN_DEV_LOG_MARTIANS(in_dev) (ipv4_devconf.log_martians || (in_dev)->cnf.log_martians) --#define IN_DEV_PROXY_ARP(in_dev) (ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp) --#define IN_DEV_SHARED_MEDIA(in_dev) (ipv4_devconf.shared_media || (in_dev)->cnf.shared_media) --#define IN_DEV_TX_REDIRECTS(in_dev) (ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects) --#define IN_DEV_SEC_REDIRECTS(in_dev) (ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) -+#define IN_DEV_MFORWARD(in_dev) (ve_ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding) -+#define IN_DEV_RPFILTER(in_dev) (ve_ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter) -+#define IN_DEV_SOURCE_ROUTE(in_dev) (ve_ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route) -+#define IN_DEV_BOOTP_RELAY(in_dev) (ve_ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay) -+ -+#define IN_DEV_LOG_MARTIANS(in_dev) (ve_ipv4_devconf.log_martians || (in_dev)->cnf.log_martians) 
-+#define IN_DEV_PROXY_ARP(in_dev) (ve_ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp) -+#define IN_DEV_SHARED_MEDIA(in_dev) (ve_ipv4_devconf.shared_media || (in_dev)->cnf.shared_media) -+#define IN_DEV_TX_REDIRECTS(in_dev) (ve_ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects) -+#define IN_DEV_SEC_REDIRECTS(in_dev) (ve_ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) - #define IN_DEV_IDTAG(in_dev) ((in_dev)->cnf.tag) - #define IN_DEV_MEDIUM_ID(in_dev) ((in_dev)->cnf.medium_id) - #define IN_DEV_PROMOTE_SECONDARIES(in_dev) (ipv4_devconf.promote_secondaries || (in_dev)->cnf.promote_secondaries) - - #define IN_DEV_RX_REDIRECTS(in_dev) \ - ((IN_DEV_FORWARD(in_dev) && \ -- (ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \ -+ (ve_ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \ - || (!IN_DEV_FORWARD(in_dev) && \ -- (ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects))) -+ (ve_ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects))) - --#define IN_DEV_ARPFILTER(in_dev) (ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter) --#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce)) --#define IN_DEV_ARP_IGNORE(in_dev) (max(ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore)) -+#define IN_DEV_ARPFILTER(in_dev) (ve_ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter) -+#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ve_ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce)) -+#define IN_DEV_ARP_IGNORE(in_dev) (max(ve_ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore)) - - struct in_ifaddr - { -@@ -113,6 +119,7 @@ extern u32 inet_select_addr(const struc - extern u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope); - extern struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask); - extern void inet_forward_change(void); -+extern void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int 
destroy); - - static __inline__ int inet_ifa_match(u32 addr, struct in_ifaddr *ifa) - { -@@ -180,6 +187,10 @@ static inline void in_dev_put(struct in_ - #define __in_dev_put(idev) atomic_dec(&(idev)->refcnt) - #define in_dev_hold(idev) atomic_inc(&(idev)->refcnt) - -+struct ve_struct; -+extern int devinet_sysctl_init(struct ve_struct *); -+extern void devinet_sysctl_fini(struct ve_struct *); -+extern void devinet_sysctl_free(struct ve_struct *); - #endif /* __KERNEL__ */ - - static __inline__ __u32 inet_make_mask(int logmask) -diff -upr linux-2.6.16.orig/include/linux/jbd.h linux-2.6.16-026test009/include/linux/jbd.h ---- linux-2.6.16.orig/include/linux/jbd.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/jbd.h 2006-04-19 15:02:11.000000000 +0400 -@@ -245,10 +245,15 @@ typedef struct journal_superblock_s - #define J_ASSERT(assert) \ - do { \ - if (!(assert)) { \ -+ unsigned long stack; \ - printk (KERN_EMERG \ - "Assertion failure in %s() at %s:%d: \"%s\"\n", \ - __FUNCTION__, __FILE__, __LINE__, # assert); \ -- BUG(); \ -+ printk("Stack=%p current=%p pid=%d ve=%d comm='%s'\n", \ -+ &stack, current, current->pid, \ -+ get_exec_env()->veid, \ -+ current->comm); \ -+ dump_stack(); \ - } \ - } while (0) - -diff -upr linux-2.6.16.orig/include/linux/kdev_t.h linux-2.6.16-026test009/include/linux/kdev_t.h ---- linux-2.6.16.orig/include/linux/kdev_t.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/kdev_t.h 2006-04-19 15:02:12.000000000 +0400 -@@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 de - return dev & 0x3ffff; - } - -+#define UNNAMED_MAJOR_COUNT 16 -+ -+#if UNNAMED_MAJOR_COUNT > 1 -+ -+extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT]; -+ -+static inline dev_t make_unnamed_dev(int idx) -+{ -+ /* -+ * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the -+ * unnamed device index into major number. 
-+ */ -+ return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)], -+ idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8)); -+} -+ -+static inline int unnamed_dev_idx(dev_t dev) -+{ -+ int i; -+ for (i = 0; i < UNNAMED_MAJOR_COUNT && -+ MAJOR(dev) != unnamed_dev_majors[i]; i++); -+ return MINOR(dev) | (i << 8); -+} -+ -+static inline int is_unnamed_dev(dev_t dev) -+{ -+ int i; -+ for (i = 0; i < UNNAMED_MAJOR_COUNT && -+ MAJOR(dev) != unnamed_dev_majors[i]; i++); -+ return i < UNNAMED_MAJOR_COUNT; -+} -+ -+#else /* UNNAMED_MAJOR_COUNT */ -+ -+static inline dev_t make_unnamed_dev(int idx) -+{ -+ return MKDEV(0, idx); -+} -+ -+static inline int unnamed_dev_idx(dev_t dev) -+{ -+ return MINOR(dev); -+} -+ -+static inline int is_unnamed_dev(dev_t dev) -+{ -+ return MAJOR(dev) == 0; -+} -+ -+#endif /* UNNAMED_MAJOR_COUNT */ -+ - - #else /* __KERNEL__ */ - -diff -upr linux-2.6.16.orig/include/linux/kernel.h linux-2.6.16-026test009/include/linux/kernel.h ---- linux-2.6.16.orig/include/linux/kernel.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/kernel.h 2006-04-19 15:02:12.000000000 +0400 -@@ -132,6 +132,9 @@ asmlinkage int vprintk(const char *fmt, - __attribute__ ((format (printf, 1, 0))); - asmlinkage int printk(const char * fmt, ...) - __attribute__ ((format (printf, 1, 2))); -+asmlinkage int ve_printk(int, const char * fmt, ...) -+ __attribute__ ((format (printf, 2, 3))); -+void prepare_printk(void); - #else - static inline int vprintk(const char *s, va_list args) - __attribute__ ((format (printf, 1, 0))); -@@ -139,8 +142,16 @@ static inline int vprintk(const char *s, - static inline int printk(const char *s, ...) - __attribute__ ((format (printf, 1, 2))); - static inline int printk(const char *s, ...) { return 0; } -+static inline int ve_printk(int d, const char *s, ...) -+ __attribute__ ((format (printf, 1, 2))); -+static inline int printk(int d, const char *s, ...) 
{ return 0; } -+#define prepare_printk() do { } while (0) - #endif - -+#define VE0_LOG 1 -+#define VE_LOG 2 -+#define VE_LOG_BOTH (VE0_LOG | VE_LOG) -+ - unsigned long int_sqrt(unsigned long); - - static inline int __attribute_pure__ long_log2(unsigned long x) -@@ -171,10 +182,13 @@ static inline void console_verbose(void) - } - - extern void bust_spinlocks(int yes); -+extern void wake_up_klogd(void); - extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ - extern __deprecated_for_modules int panic_timeout; - extern int panic_on_oops; -+extern int decode_call_traces; - extern int tainted; -+extern int kernel_text_csum_broken; - extern const char *print_tainted(void); - extern void add_taint(unsigned); - -diff -upr linux-2.6.16.orig/include/linux/kmem_cache.h linux-2.6.16-026test009/include/linux/kmem_cache.h ---- linux-2.6.16.orig/include/linux/kmem_cache.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/kmem_cache.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,199 @@ -+#ifndef __KMEM_CACHE_H__ -+#define __KMEM_CACHE_H__ -+#include <linux/threads.h> -+#include <linux/smp.h> -+#include <linux/spinlock.h> -+#include <linux/list.h> -+#include <linux/mm.h> -+#include <asm/atomic.h> -+ -+/* -+ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, -+ * SLAB_RED_ZONE & SLAB_POISON. -+ * 0 for faster, smaller code (especially in the critical paths). -+ * -+ * STATS - 1 to collect stats for /proc/slabinfo. -+ * 0 for faster, smaller code (especially in the critical paths). 
-+ * -+ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) -+ */ -+ -+#ifdef CONFIG_DEBUG_SLAB -+#define SLAB_DEBUG 1 -+#define SLAB_STATS 1 -+#define SLAB_FORCED_DEBUG 1 -+#else -+#define SLAB_DEBUG 0 -+#define SLAB_STATS 0 -+#define SLAB_FORCED_DEBUG 0 -+#endif -+ -+/* -+ * struct array_cache -+ * -+ * Purpose: -+ * - LIFO ordering, to hand out cache-warm objects from _alloc -+ * - reduce the number of linked list operations -+ * - reduce spinlock operations -+ * -+ * The limit is stored in the per-cpu structure to reduce the data cache -+ * footprint. -+ * -+ */ -+struct array_cache { -+ unsigned int avail; -+ unsigned int limit; -+ unsigned int batchcount; -+ unsigned int touched; -+ spinlock_t lock; -+ void *entry[0]; /* -+ * Must have this definition in here for the proper -+ * alignment of array_cache. Also simplifies accessing -+ * the entries. -+ * [0] is for gcc 2.95. It should really be []. -+ */ -+}; -+ -+/* bootstrap: The caches do not work without cpuarrays anymore, -+ * but the cpuarrays are allocated from the generic caches... -+ */ -+#define BOOT_CPUCACHE_ENTRIES 1 -+struct arraycache_init { -+ struct array_cache cache; -+ void *entries[BOOT_CPUCACHE_ENTRIES]; -+}; -+ -+/* -+ * The slab lists for all objects. -+ */ -+struct kmem_list3 { -+ struct list_head slabs_partial; /* partial list first, better asm code */ -+ struct list_head slabs_full; -+ struct list_head slabs_free; -+ unsigned long free_objects; -+ unsigned long next_reap; -+ int free_touched; -+ unsigned int free_limit; -+ unsigned int colour_next; /* Per-node cache coloring */ -+ spinlock_t list_lock; -+ struct array_cache *shared; /* shared per node */ -+ struct array_cache **alien; /* on other nodes */ -+}; -+ -+/* -+ * struct kmem_cache -+ * -+ * manages a cache. 
-+ */ -+ -+struct kmem_cache { -+/* 1) per-cpu data, touched during every alloc/free */ -+ struct array_cache *array[NR_CPUS]; -+ unsigned int batchcount; -+ unsigned int limit; -+ unsigned int shared; -+ unsigned int buffer_size; -+/* 2) touched by every alloc & free from the backend */ -+ struct kmem_list3 *nodelists[MAX_NUMNODES]; -+ unsigned int flags; /* constant flags */ -+ unsigned int num; /* # of objs per slab */ -+ spinlock_t spinlock; -+ -+/* 3) cache_grow/shrink */ -+ /* order of pgs per slab (2^n) */ -+ unsigned int gfporder; -+ -+ /* force GFP flags, e.g. GFP_DMA */ -+ gfp_t gfpflags; -+ -+ size_t colour; /* cache colouring range */ -+ unsigned int colour_off; /* colour offset */ -+ struct kmem_cache *slabp_cache; -+ unsigned int slab_size; -+ unsigned int dflags; /* dynamic flags */ -+ -+ /* constructor func */ -+ void (*ctor) (void *, struct kmem_cache *, unsigned long); -+ -+ /* de-constructor func */ -+ void (*dtor) (void *, struct kmem_cache *, unsigned long); -+ -+/* 4) cache creation/removal */ -+ const char *name; -+ struct list_head next; -+ -+/* 5) statistics */ -+#if SLAB_STATS -+ unsigned long num_active; -+ unsigned long num_allocations; -+ unsigned long high_mark; -+ unsigned long grown; -+ unsigned long reaped; -+ unsigned long errors; -+ unsigned long max_freeable; -+ unsigned long node_allocs; -+ unsigned long node_frees; -+ atomic_t allochit; -+ atomic_t allocmiss; -+ atomic_t freehit; -+ atomic_t freemiss; -+#endif -+#if SLAB_DEBUG -+ /* -+ * If debugging is enabled, then the allocator can add additional -+ * fields and/or padding to every object. buffer_size contains the total -+ * object size including these internal fields, the following two -+ * variables contain the offset to the user object and its size. 
-+ */ -+ int obj_offset; -+ int obj_size; -+#endif -+#ifdef CONFIG_USER_RESOURCE -+ unsigned int objuse; -+#endif -+}; -+ -+#define CFLGS_OFF_SLAB (0x80000000UL) -+#define CFLGS_ENVIDS (0x04000000UL) -+#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -+#define ENVIDS(x) ((x)->flags & CFLGS_ENVIDS) -+#define kmem_mark_nocharge(c) do { (c)->flags |= SLAB_NO_CHARGE; } while (0) -+ -+struct slab; -+/* Functions for storing/retrieving the cachep and or slab from the -+ * global 'mem_map'. These are used to find the slab an obj belongs to. -+ * With kfree(), these are used to find the cache which an obj belongs to. -+ */ -+static inline void page_set_cache(struct page *page, struct kmem_cache *cache) -+{ -+ page->lru.next = (struct list_head *)cache; -+} -+ -+static inline struct kmem_cache *page_get_cache(struct page *page) -+{ -+ return (struct kmem_cache *)page->lru.next; -+} -+ -+static inline void page_set_slab(struct page *page, struct slab *slab) -+{ -+ page->lru.prev = (struct list_head *)slab; -+} -+ -+static inline struct slab *page_get_slab(struct page *page) -+{ -+ return (struct slab *)page->lru.prev; -+} -+ -+static inline struct kmem_cache *virt_to_cache(const void *obj) -+{ -+ struct page *page = virt_to_page(obj); -+ return page_get_cache(page); -+} -+ -+static inline struct slab *virt_to_slab(const void *obj) -+{ -+ struct page *page = virt_to_page(obj); -+ return page_get_slab(page); -+} -+ -+#endif -diff -upr linux-2.6.16.orig/include/linux/kmem_slab.h linux-2.6.16-026test009/include/linux/kmem_slab.h ---- linux-2.6.16.orig/include/linux/kmem_slab.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/kmem_slab.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,71 @@ -+#ifndef __KMEM_SLAB_H__ -+#define __KMEM_SLAB_H__ -+ -+/* -+ * kmem_bufctl_t: -+ * -+ * Bufctl's are used for linking objs within a slab -+ * linked offsets. 
-+ * -+ * This implementation relies on "struct page" for locating the cache & -+ * slab an object belongs to. -+ * This allows the bufctl structure to be small (one int), but limits -+ * the number of objects a slab (not a cache) can contain when off-slab -+ * bufctls are used. The limit is the size of the largest general cache -+ * that does not use off-slab slabs. -+ * For 32bit archs with 4 kB pages, is this 56. -+ * This is not serious, as it is only for large objects, when it is unwise -+ * to have too many per slab. -+ * Note: This limit can be raised by introducing a general cache whose size -+ * is less than 512 (PAGE_SIZE<<3), but greater than 256. -+ */ -+ -+typedef unsigned int kmem_bufctl_t; -+#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) -+#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -+#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) -+ -+/* -+ * struct slab -+ * -+ * Manages the objs in a slab. Placed either at the beginning of mem allocated -+ * for a slab, or allocated from an general cache. -+ * Slabs are chained into three list: fully used, partial, fully free slabs. -+ */ -+struct slab { -+ struct list_head list; -+ unsigned long colouroff; -+ void *s_mem; /* including colour offset */ -+ unsigned int inuse; /* num of objs active in slab */ -+ kmem_bufctl_t free; -+ unsigned short nodeid; -+}; -+ -+/* -+ * struct slab_rcu -+ * -+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to -+ * arrange for kmem_freepages to be called via RCU. This is useful if -+ * we need to approach a kernel structure obliquely, from its address -+ * obtained without the usual locking. We can lock the structure to -+ * stabilize it and check it's still at the given address, only if we -+ * can be sure that the memory has not been meanwhile reused for some -+ * other kind of object (which our subsystem's lock might corrupt). 
-+ * -+ * rcu_read_lock before reading the address, then rcu_read_unlock after -+ * taking the spinlock within the structure expected at that address. -+ * -+ * We assume struct slab_rcu can overlay struct slab when destroying. -+ */ -+struct slab_rcu { -+ struct rcu_head head; -+ struct kmem_cache *cachep; -+ void *addr; -+}; -+ -+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) -+{ -+ return (kmem_bufctl_t *) (slabp + 1); -+} -+ -+#endif -diff -upr linux-2.6.16.orig/include/linux/list.h linux-2.6.16-026test009/include/linux/list.h ---- linux-2.6.16.orig/include/linux/list.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/list.h 2006-04-19 15:02:12.000000000 +0400 -@@ -325,6 +325,9 @@ static inline void list_splice_init(stru - #define list_entry(ptr, type, member) \ - container_of(ptr, type, member) - -+#define list_first_entry(ptr, type, member) \ -+ container_of((ptr)->next, type, member) -+ - /** - * list_for_each - iterate over a list - * @pos: the &struct list_head to use as a loop counter. -@@ -411,6 +414,20 @@ static inline void list_splice_init(stru - pos = list_entry(pos->member.next, typeof(*pos), member)) - - /** -+ * list_for_each_entry_continue_reverse - iterate backwards over list of given -+ * type continuing after existing point -+ * @pos: the type * to use as a loop counter. -+ * @head: the head for your list. -+ * @member: the name of the list_struct within the struct. -+ */ -+#define list_for_each_entry_continue_reverse(pos, head, member) \ -+ for (pos = list_entry(pos->member.prev, typeof(*pos), member), \ -+ prefetch(pos->member.prev); \ -+ &pos->member != (head); \ -+ pos = list_entry(pos->member.prev, typeof(*pos), member), \ -+ prefetch(pos->member.prev)) -+ -+/** - * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry - * @pos: the type * to use as a loop counter. 
- * @n: another type * to use as temporary storage -diff -upr linux-2.6.16.orig/include/linux/major.h linux-2.6.16-026test009/include/linux/major.h ---- linux-2.6.16.orig/include/linux/major.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/major.h 2006-04-19 15:02:12.000000000 +0400 -@@ -165,4 +165,7 @@ - - #define VIOTAPE_MAJOR 230 - -+#define UNNAMED_EXTRA_MAJOR 130 -+#define UNNAMED_EXTRA_MAJOR_COUNT 120 -+ - #endif -diff -upr linux-2.6.16.orig/include/linux/mm.h linux-2.6.16-026test009/include/linux/mm.h ---- linux-2.6.16.orig/include/linux/mm.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/mm.h 2006-04-19 15:02:12.000000000 +0400 -@@ -41,6 +41,27 @@ extern int sysctl_legacy_va_layout; - - #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) - -+#include <linux/mm_counter.h> -+ -+#ifdef CONFIG_USER_RESOURCE -+#define set_vma_rss(vma, v) set_mm_counter(vma, vm_rss, v) -+#define get_vma_rss(vma) get_mm_counter(vma, vm_rss) -+#define inc_vma_rss(vma) inc_mm_counter(vma, vm_rss) -+#define dec_vma_rss(vma) dec_mm_counter(vma, vm_rss) -+#define add_vma_rss(vma, v) add_mm_counter(vma, vm_rss, v) -+#define sub_vma_rss(vma, v) do { \ -+ if (unlikely(dec_mm_counter_chk(vma, vm_rss, v))) \ -+ warn_bad_rss(vma, v); \ -+ } while (0) -+#else -+#define set_vma_rss(vma, v) do { } while (0) -+#define get_vma_rss(vma) (0) -+#define inc_vma_rss(vma) do { } while (0) -+#define dec_vma_rss(vma) do { } while (0) -+#define add_vma_rss(vma, v) do { } while (0) -+#define sub_vma_rss(vma, v) do { } while (0) -+#endif -+ - /* - * Linux kernel virtual memory manager primitives. 
- * The idea being to have a "virtual" mm in the same way -@@ -111,6 +132,9 @@ struct vm_area_struct { - #ifdef CONFIG_NUMA - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ - #endif -+#ifdef CONFIG_USER_RESOURCE -+ mm_counter_t _vm_rss; -+#endif - }; - - /* -@@ -229,10 +253,9 @@ struct page { - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for -- * swp_entry_t if PageSwapCache. -- * When page is free, this -+ * swp_entry_t if PageSwapCache; - * indicates order in the buddy -- * system. -+ * system if PG_buddy is set. - */ - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. -@@ -264,6 +287,12 @@ struct page { - void *virtual; /* Kernel virtual address (NULL if - not kmapped, ie. highmem) */ - #endif /* WANT_PAGE_VIRTUAL */ -+#ifdef CONFIG_USER_RESOURCE -+ union { -+ struct user_beancounter *page_ub; -+ struct page_beancounter *page_pb; -+ } bc; -+#endif - }; - - #define page_private(page) ((page)->private) -@@ -636,16 +665,9 @@ struct page *shmem_nopage(struct vm_area - int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); - struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr); --int shmem_lock(struct file *file, int lock, struct user_struct *user); - #else - #define shmem_nopage filemap_nopage - --static inline int shmem_lock(struct file *file, int lock, -- struct user_struct *user) --{ -- return 0; --} -- - static inline int shmem_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) - { -@@ -706,7 +728,9 @@ void free_pgd_range(struct mmu_gather ** - void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); - int copy_page_range(struct mm_struct *dst, struct mm_struct *src, -- struct vm_area_struct *vma); -+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -+int __copy_page_range(struct 
vm_area_struct *dst_vma, struct vm_area_struct *vma, -+ unsigned long addr, size_t size); - int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, - unsigned long size, pgprot_t prot); - void unmap_mapping_range(struct address_space *mapping, -diff -upr linux-2.6.16.orig/include/linux/mm_counter.h linux-2.6.16-026test009/include/linux/mm_counter.h ---- linux-2.6.16.orig/include/linux/mm_counter.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/mm_counter.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,32 @@ -+#ifndef __MM_COUNTER_H_ -+#define __MM_COUNTER_H_ -+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS -+/* -+ * The mm counters are not protected by its page_table_lock, -+ * so must be incremented atomically. -+ */ -+#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) -+#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) -+#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) -+#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) -+#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) -+#define dec_mm_counter_chk(mm, member, value) \ -+ atomic_long_add_negative(-(value), &(mm)->_##member) -+typedef atomic_long_t mm_counter_t; -+ -+#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ -+/* -+ * The mm counters are protected by its page_table_lock, -+ * so can be incremented directly. 
-+ */ -+#define set_mm_counter(mm, member, value) (mm)->_##member = (value) -+#define get_mm_counter(mm, member) ((mm)->_##member) -+#define add_mm_counter(mm, member, value) (mm)->_##member += (value) -+#define inc_mm_counter(mm, member) (mm)->_##member++ -+#define dec_mm_counter(mm, member) (mm)->_##member-- -+#define dec_mm_counter_chk(mm, member, value) \ -+ (((mm)->_##member -= (value)) < 0) -+typedef unsigned long mm_counter_t; -+ -+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ -+#endif -diff -upr linux-2.6.16.orig/include/linux/msg.h linux-2.6.16-026test009/include/linux/msg.h ---- linux-2.6.16.orig/include/linux/msg.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/msg.h 2006-04-19 15:02:12.000000000 +0400 -@@ -92,6 +92,8 @@ struct msg_queue { - struct list_head q_senders; - }; - -+int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg); -+ - #endif /* __KERNEL__ */ - - #endif /* _LINUX_MSG_H */ -diff -upr linux-2.6.16.orig/include/linux/namei.h linux-2.6.16-026test009/include/linux/namei.h ---- linux-2.6.16.orig/include/linux/namei.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/namei.h 2006-04-19 15:02:12.000000000 +0400 -@@ -48,12 +48,15 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA - #define LOOKUP_PARENT 16 - #define LOOKUP_NOALT 32 - #define LOOKUP_REVAL 64 -+#define LOOKUP_STRICT 128 /* no symlinks or other filesystems */ -+ - /* - * Intent data - */ - #define LOOKUP_OPEN (0x0100) - #define LOOKUP_CREATE (0x0200) - #define LOOKUP_ACCESS (0x0400) -+#define LOOKUP_NOAREACHECK (0x0800) /* no area check on lookup */ - - extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); - extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *)); -diff -upr linux-2.6.16.orig/include/linux/namespace.h linux-2.6.16-026test009/include/linux/namespace.h ---- linux-2.6.16.orig/include/linux/namespace.h 2006-04-19 
15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/namespace.h 2006-04-19 15:02:12.000000000 +0400 -@@ -13,6 +13,8 @@ struct namespace { - int event; - }; - -+extern struct rw_semaphore namespace_sem; -+ - extern int copy_namespace(int, struct task_struct *); - extern void __put_namespace(struct namespace *namespace); - extern struct namespace *dup_namespace(struct task_struct *, struct fs_struct *); -diff -upr linux-2.6.16.orig/include/linux/netdevice.h linux-2.6.16-026test009/include/linux/netdevice.h ---- linux-2.6.16.orig/include/linux/netdevice.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netdevice.h 2006-04-19 15:02:12.000000000 +0400 -@@ -37,6 +37,7 @@ - #include <linux/config.h> - #include <linux/device.h> - #include <linux/percpu.h> -+#include <linux/ctype.h> - - struct divert_blk; - struct vlan_group; -@@ -233,6 +234,11 @@ enum netdev_state_t - __LINK_STATE_LINKWATCH_PENDING - }; - -+struct netdev_bc { -+ struct user_beancounter *exec_ub, *owner_ub; -+}; -+ -+#define netdev_bc(dev) (&(dev)->dev_bc) - - /* - * This structure holds at boot time configured netdevice settings. 
They -@@ -309,6 +315,8 @@ struct net_device - #define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */ - #define NETIF_F_LLTX 4096 /* LockLess TX */ - #define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ -+#define NETIF_F_VIRTUAL 0x40000000 /* can be registered in ve */ -+#define NETIF_F_VENET 0x80000000 /* Device is VENET device */ - - struct net_device *next_sched; - -@@ -431,6 +439,7 @@ struct net_device - enum { NETREG_UNINITIALIZED=0, - NETREG_REGISTERING, /* called register_netdevice */ - NETREG_REGISTERED, /* completed register todo */ -+ NETREG_REGISTER_ERR, /* register todo failed */ - NETREG_UNREGISTERING, /* called unregister_netdevice */ - NETREG_UNREGISTERED, /* completed unregister todo */ - NETREG_RELEASED, /* called free_netdev */ -@@ -500,8 +509,18 @@ struct net_device - struct divert_blk *divert; - #endif /* CONFIG_NET_DIVERT */ - -+ unsigned orig_mtu; /* MTU value before move to VE */ -+ struct ve_struct *owner_env; /* Owner VE of the interface */ -+ struct netdev_bc dev_bc; -+ - /* class/net/name entry */ - struct class_device class_dev; -+ -+#ifdef CONFIG_VE -+ /* List entry in global devices list to keep track of their names -+ * assignment */ -+ struct list_head dev_global_list_entry; -+#endif - }; - - #define NETDEV_ALIGN 32 -@@ -535,9 +554,23 @@ struct packet_type { - #include <linux/notifier.h> - - extern struct net_device loopback_dev; /* The loopback */ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define loopback_dev (*get_exec_env()->_loopback_dev) -+#define ve0_loopback (*get_ve0()->_loopback_dev) -+#define dev_base (get_exec_env()->_net_dev_base) -+#define visible_dev_head(x) (&(x)->_net_dev_head) -+#define visible_dev_index_head(x) (&(x)->_net_dev_index_head) -+#else - extern struct net_device *dev_base; /* All devices */ -+#define ve0_loopback loopback_dev -+#define visible_dev_head(x) NULL -+#define visible_dev_index_head(x) NULL -+#endif - extern rwlock_t dev_base_lock; /* Device list lock */ - 
-+struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env); -+struct hlist_head *dev_index_hash(int ifindex, struct ve_struct *env); -+ - extern int netdev_boot_setup_check(struct net_device *dev); - extern unsigned long netdev_boot_base(const char *prefix, int unit); - extern struct net_device *dev_getbyhwaddr(unsigned short type, char *hwaddr); -@@ -554,6 +587,7 @@ extern int dev_alloc_name(struct net_de - extern int dev_open(struct net_device *dev); - extern int dev_close(struct net_device *dev); - extern int dev_queue_xmit(struct sk_buff *skb); -+extern int dev_set_mtu(struct net_device *dev, int new_mtu); - extern int register_netdevice(struct net_device *dev); - extern int unregister_netdevice(struct net_device *dev); - extern void free_netdev(struct net_device *dev); -@@ -951,6 +985,18 @@ extern void dev_seq_stop(struct seq_file - - extern void linkwatch_run_queue(void); - -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+static inline int ve_is_dev_movable(struct net_device *dev) -+{ -+ return !(dev->features & NETIF_F_VIRTUAL); -+} -+#else -+static inline int ve_is_dev_movable(struct net_device *dev) -+{ -+ return 0; -+} -+#endif -+ - #endif /* __KERNEL__ */ - - #endif /* _LINUX_DEV_H */ -diff -upr linux-2.6.16.orig/include/linux/netfilter/nf_conntrack_ftp.h linux-2.6.16-026test009/include/linux/netfilter/nf_conntrack_ftp.h ---- linux-2.6.16.orig/include/linux/netfilter/nf_conntrack_ftp.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter/nf_conntrack_ftp.h 2006-04-19 15:02:12.000000000 +0400 -@@ -32,13 +32,22 @@ struct ip_conntrack_expect; - - /* For NAT to hook in when we find a packet which describes what other - * connection we should expect. 
*/ --extern unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, -+typedef unsigned int (*ip_nat_helper_ftp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - enum ip_ct_ftp_type type, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp, - u32 *seq); -+extern ip_nat_helper_ftp_hook ip_nat_ftp_hook; -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_ip_nat_ftp_hook \ -+ ((ip_nat_helper_ftp_hook) \ -+ (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook)) -+#else -+#define ve_ip_nat_ftp_hook ip_nat_ftp_hook -+#endif - #endif /* __KERNEL__ */ - - #endif /* _NF_CONNTRACK_FTP_H */ -diff -upr linux-2.6.16.orig/include/linux/netfilter/x_tables.h linux-2.6.16-026test009/include/linux/netfilter/x_tables.h ---- linux-2.6.16.orig/include/linux/netfilter/x_tables.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter/x_tables.h 2006-04-19 15:02:12.000000000 +0400 -@@ -80,12 +80,19 @@ struct xt_counters_info - - #ifdef __KERNEL__ - -+#include <linux/config.h> - #include <linux/netdevice.h> - - #define ASSERT_READ_LOCK(x) - #define ASSERT_WRITE_LOCK(x) - #include <linux/netfilter_ipv4/listhelp.h> - -+#ifdef CONFIG_COMPAT -+#define COMPAT_TO_USER 1 -+#define COMPAT_FROM_USER -1 -+#define COMPAT_CALC_SIZE 0 -+#endif -+ - struct xt_match - { - struct list_head list; -@@ -118,6 +125,10 @@ struct xt_match - /* Called when entry of this type deleted. */ - void (*destroy)(void *matchinfo, unsigned int matchinfosize); - -+#ifdef CONFIG_COMPAT -+ /* Called when userspace align differs from kernel space one */ -+ int (*compat)(void *match, void **dstptr, int *size, int convert); -+#endif - /* Set this to THIS_MODULE if you are a module, otherwise NULL */ - struct module *me; - }; -@@ -154,6 +165,10 @@ struct xt_target - /* Called when entry of this type deleted. 
*/ - void (*destroy)(void *targinfo, unsigned int targinfosize); - -+#ifdef CONFIG_COMPAT -+ /* Called when userspace align differs from kernel space one */ -+ int (*compat)(void *target, void **dstptr, int *size, int convert); -+#endif - /* Set this to THIS_MODULE if you are a module, otherwise NULL */ - struct module *me; - }; -@@ -211,6 +226,10 @@ extern int xt_register_table(struct xt_t - struct xt_table_info *bootstrap, - struct xt_table_info *newinfo); - extern void *xt_unregister_table(struct xt_table *table); -+extern struct xt_table *virt_xt_register_table(struct xt_table *table, -+ struct xt_table_info *bootstrap, -+ struct xt_table_info *newinfo); -+extern void *virt_xt_unregister_table(struct xt_table *table); - - extern struct xt_table_info *xt_replace_table(struct xt_table *table, - unsigned int num_counters, -@@ -233,6 +252,34 @@ extern void xt_proto_fini(int af); - extern struct xt_table_info *xt_alloc_table_info(unsigned int size); - extern void xt_free_table_info(struct xt_table_info *info); - -+#ifdef CONFIG_COMPAT -+#include <net/compat.h> -+ -+/* FIXME: this works only on 32 bit tasks -+ * need to change whole approach in order to calculate align as function of -+ * current task alignment */ -+ -+struct compat_xt_counters -+{ -+ u_int32_t cnt[4]; -+}; -+ -+struct compat_xt_counters_info -+{ -+ char name[XT_TABLE_MAXNAMELEN]; -+ compat_uint_t num_counters; -+ struct compat_xt_counters counters[0]; -+}; -+ -+#define COMPAT_XT_ALIGN(s) (((s) + (__alignof__(struct compat_xt_counters)-1)) \ -+ & ~(__alignof__(struct compat_xt_counters)-1)) -+ -+extern int ipt_match_align_compat(void *match, void **dstptr, -+ int *size, int off, int convert); -+extern int ipt_target_align_compat(void *target, void **dstptr, -+ int *size, int off, int convert); -+ -+#endif /* CONFIG_COMPAT */ - #endif /* __KERNEL__ */ - - #endif /* _X_TABLES_H */ -diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_conntrack.h 
linux-2.6.16-026test009/include/linux/netfilter/xt_conntrack.h ---- linux-2.6.16.orig/include/linux/netfilter/xt_conntrack.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter/xt_conntrack.h 2006-04-19 15:02:11.000000000 +0400 -@@ -5,6 +5,7 @@ - #ifndef _XT_CONNTRACK_H - #define _XT_CONNTRACK_H - -+#include <linux/config.h> - #include <linux/netfilter/nf_conntrack_tuple_common.h> - #include <linux/in.h> - -@@ -60,4 +61,21 @@ struct xt_conntrack_info - /* Inverse flags */ - u_int8_t invflags; - }; -+ -+#ifdef CONFIG_COMPAT -+struct compat_xt_conntrack_info -+{ -+ compat_uint_t statemask, statusmask; -+ -+ struct ip_conntrack_tuple tuple[IP_CT_DIR_MAX]; -+ struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX]; -+ -+ compat_ulong_t expires_min, expires_max; -+ -+ /* Flags word */ -+ u_int8_t flags; -+ /* Inverse flags */ -+ u_int8_t invflags; -+}; -+#endif - #endif /*_XT_CONNTRACK_H*/ -diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_helper.h linux-2.6.16-026test009/include/linux/netfilter/xt_helper.h ---- linux-2.6.16.orig/include/linux/netfilter/xt_helper.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter/xt_helper.h 2006-04-19 15:02:11.000000000 +0400 -@@ -1,8 +1,17 @@ - #ifndef _XT_HELPER_H - #define _XT_HELPER_H - -+#include <linux/config.h> -+ - struct xt_helper_info { - int invert; - char name[30]; - }; -+ -+#ifdef CONFIG_COMPAT -+struct compat_xt_helper_info { -+ compat_int_t invert; -+ char name[30]; -+}; -+#endif - #endif /* _XT_HELPER_H */ -diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_limit.h linux-2.6.16-026test009/include/linux/netfilter/xt_limit.h ---- linux-2.6.16.orig/include/linux/netfilter/xt_limit.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter/xt_limit.h 2006-04-19 15:02:11.000000000 +0400 -@@ -1,6 +1,8 @@ - #ifndef _XT_RATE_H - #define _XT_RATE_H - -+#include <linux/config.h> -+ - /* timings are in 
milliseconds. */ - #define XT_LIMIT_SCALE 10000 - -@@ -18,4 +20,19 @@ struct xt_rateinfo { - /* Ugly, ugly fucker. */ - struct xt_rateinfo *master; - }; -+ -+#ifdef CONFIG_COMPAT -+struct compat_xt_rateinfo { -+ u_int32_t avg; /* Average secs between packets * scale */ -+ u_int32_t burst; /* Period multiplier for upper limit. */ -+ -+ /* Used internally by the kernel */ -+ compat_ulong_t prev; -+ u_int32_t credit; -+ u_int32_t credit_cap, cost; -+ -+ /* Ugly, ugly fucker. */ -+ compat_uptr_t master; -+}; -+#endif - #endif /*_XT_RATE_H*/ -diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_state.h linux-2.6.16-026test009/include/linux/netfilter/xt_state.h ---- linux-2.6.16.orig/include/linux/netfilter/xt_state.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter/xt_state.h 2006-04-19 15:02:11.000000000 +0400 -@@ -1,6 +1,8 @@ - #ifndef _XT_STATE_H - #define _XT_STATE_H - -+#include <linux/config.h> -+ - #define XT_STATE_BIT(ctinfo) (1 << ((ctinfo)%IP_CT_IS_REPLY+1)) - #define XT_STATE_INVALID (1 << 0) - -@@ -10,4 +12,11 @@ struct xt_state_info - { - unsigned int statemask; - }; -+ -+#ifdef CONFIG_COMPAT -+struct compat_xt_state_info -+{ -+ compat_uint_t statemask; -+}; -+#endif - #endif /*_XT_STATE_H*/ -diff -upr linux-2.6.16.orig/include/linux/netfilter.h linux-2.6.16-026test009/include/linux/netfilter.h ---- linux-2.6.16.orig/include/linux/netfilter.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter.h 2006-04-19 15:02:12.000000000 +0400 -@@ -107,12 +107,21 @@ struct nf_info - int nf_register_hook(struct nf_hook_ops *reg); - void nf_unregister_hook(struct nf_hook_ops *reg); - -+int virt_nf_register_hook(struct nf_hook_ops *reg); -+int virt_nf_unregister_hook(struct nf_hook_ops *reg); -+ - /* Functions to register get/setsockopt ranges (non-inclusive). You - need to check permissions yourself! 
*/ - int nf_register_sockopt(struct nf_sockopt_ops *reg); - void nf_unregister_sockopt(struct nf_sockopt_ops *reg); - -+#ifdef CONFIG_VE_IPTABLES -+#define ve_nf_hooks \ -+ ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks)) -+#else - extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; -+#define ve_nf_hooks nf_hooks -+#endif - - /* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will - * disappear once iptables is replaced with pkttables. Please DO NOT use them -@@ -190,7 +199,7 @@ static inline int nf_hook_thresh(int pf, - if (!cond) - return 1; - #ifndef CONFIG_NETFILTER_DEBUG -- if (list_empty(&nf_hooks[pf][hook])) -+ if (list_empty(&ve_nf_hooks[pf][hook])) - return 1; - #endif - return nf_hook_slow(pf, hook, pskb, indev, outdev, okfn, thresh); -diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack.h linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack.h ---- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack.h 2006-04-19 15:02:12.000000000 +0400 -@@ -71,6 +71,11 @@ do { \ - - struct ip_conntrack_helper; - -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/ve.h> -+#include <linux/ve_owner.h> -+#endif -+ - struct ip_conntrack - { - /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, -@@ -122,8 +127,15 @@ struct ip_conntrack - /* Traversed often, so hopefully in different cacheline to top */ - /* These are my tuples; original and reply */ - struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; -+#ifdef CONFIG_VE_IPTABLES -+ struct ve_struct *ct_owner_env; -+#endif - }; - -+#ifdef CONFIG_VE_IPTABLES -+DCL_VE_OWNER_PROTO(CT, struct ip_conntrack, ct_owner_env) -+#endif -+ - struct ip_conntrack_expect - { - /* Internal linked list (global expectation list) */ -@@ -232,7 +244,15 @@ extern void ip_conntrack_tcp_update(stru - enum ip_conntrack_dir dir); - - /* Call me 
when a conntrack is destroyed. */ -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_ip_conntrack_destroyed \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_destroyed) -+#else - extern void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack); -+#define ve_ip_conntrack_destroyed ip_conntrack_destroyed -+#endif -+ - - /* Fake conntrack entry for untracked connections */ - extern struct ip_conntrack ip_conntrack_untracked; -@@ -261,7 +281,7 @@ extern void ip_conntrack_proto_put(struc - extern void ip_ct_remove_expectations(struct ip_conntrack *ct); - - extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *, -- struct ip_conntrack_tuple *); -+ struct ip_conntrack_tuple *, struct user_beancounter *); - - extern void ip_conntrack_free(struct ip_conntrack *ct); - -@@ -270,6 +290,8 @@ extern void ip_conntrack_hash_insert(str - extern struct ip_conntrack_expect * - __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); - -+extern void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp); -+ - extern struct ip_conntrack_expect * - ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); - -@@ -291,6 +313,7 @@ static inline int is_dying(struct ip_con - } - - extern unsigned int ip_conntrack_htable_size; -+extern int ip_conntrack_disable_ve0; - - #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) - -@@ -341,6 +364,9 @@ ip_conntrack_event_cache(enum ip_conntra - struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct; - struct ip_conntrack_ecache *ecache; - -+ if (!ve_is_super(get_exec_env())) -+ return; -+ - local_bh_disable(); - ecache = &__get_cpu_var(ip_conntrack_ecache); - if (ct != ecache->ct) -@@ -352,7 +378,7 @@ ip_conntrack_event_cache(enum ip_conntra - static inline void ip_conntrack_event(enum ip_conntrack_events event, - struct ip_conntrack *ct) - { -- if (is_confirmed(ct) && !is_dying(ct)) -+ if (is_confirmed(ct) && !is_dying(ct) && ve_is_super(get_exec_env())) - 
notifier_call_chain(&ip_conntrack_chain, event, ct); - } - -@@ -360,7 +386,8 @@ static inline void - ip_conntrack_expect_event(enum ip_conntrack_expect_events event, - struct ip_conntrack_expect *exp) - { -- notifier_call_chain(&ip_conntrack_expect_chain, event, exp); -+ if (ve_is_super(get_exec_env())) -+ notifier_call_chain(&ip_conntrack_expect_chain, event, exp); - } - #else /* CONFIG_IP_NF_CONNTRACK_EVENTS */ - static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, -diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_core.h linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack_core.h ---- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-04-19 15:02:12.000000000 +0400 -@@ -3,7 +3,6 @@ - #include <linux/netfilter.h> - - #define MAX_IP_CT_PROTO 256 --extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; - - /* This header is used to share core functionality between the - standalone connection tracking module, and the compatibility layer's use -@@ -54,8 +53,26 @@ static inline int ip_conntrack_confirm(s - - extern void ip_ct_unlink_expect(struct ip_conntrack_expect *exp); - -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_ip_ct_protos \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_protos) -+#define ve_ip_conntrack_hash \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_hash) -+#define ve_ip_conntrack_expect_list \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_expect_list) -+#define ve_ip_conntrack_vmalloc \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_vmalloc) -+#else -+extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; - extern struct list_head *ip_conntrack_hash; - extern struct list_head ip_conntrack_expect_list; -+#define ve_ip_ct_protos ip_ct_protos -+#define ve_ip_conntrack_hash ip_conntrack_hash -+#define 
ve_ip_conntrack_expect_list ip_conntrack_expect_list -+#define ve_ip_conntrack_vmalloc ip_conntrack_vmalloc -+#endif /* CONFIG_VE_IPTABLES */ -+ - extern rwlock_t ip_conntrack_lock; - #endif /* _IP_CONNTRACK_CORE_H */ - -diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_helper.h linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack_helper.h ---- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-04-19 15:02:12.000000000 +0400 -@@ -31,6 +31,9 @@ struct ip_conntrack_helper - extern int ip_conntrack_helper_register(struct ip_conntrack_helper *); - extern void ip_conntrack_helper_unregister(struct ip_conntrack_helper *); - -+extern int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *); -+extern void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *); -+ - /* Allocate space for an expectation: this is mandatory before calling - ip_conntrack_expect_related. You will have to call put afterwards. 
*/ - extern struct ip_conntrack_expect * -@@ -41,4 +44,5 @@ extern void ip_conntrack_expect_put(stru - extern int ip_conntrack_expect_related(struct ip_conntrack_expect *exp); - extern void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp); - -+extern struct list_head helpers; - #endif /*_IP_CONNTRACK_HELPER_H*/ -diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_irc.h linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack_irc.h ---- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-04-19 15:02:12.000000000 +0400 -@@ -14,16 +14,26 @@ - #ifndef _IP_CONNTRACK_IRC_H - #define _IP_CONNTRACK_IRC_H - -+#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -+ - /* This structure exists only once per master */ - struct ip_ct_irc_master { - }; - - #ifdef __KERNEL__ --extern unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, -- enum ip_conntrack_info ctinfo, -- unsigned int matchoff, -- unsigned int matchlen, -- struct ip_conntrack_expect *exp); -+typedef unsigned int (*ip_nat_helper_irc_hook)(struct sk_buff **, -+ enum ip_conntrack_info, unsigned int, unsigned int, -+ struct ip_conntrack_expect *); -+ -+extern ip_nat_helper_irc_hook ip_nat_irc_hook; -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_ip_nat_irc_hook \ -+ ((ip_nat_helper_irc_hook) \ -+ (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook)) -+#else -+#define ve_ip_nat_irc_hook ip_nat_irc_hook -+#endif - - #define IRC_PORT 6667 - -diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_protocol.h linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack_protocol.h ---- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-04-19 15:02:12.000000000 +0400 -@@ -67,6 
+67,7 @@ struct ip_conntrack_protocol - /* Protocol registration. */ - extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto); - extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto); -+ - /* Existing built-in protocols */ - extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp; - extern struct ip_conntrack_protocol ip_conntrack_protocol_udp; -@@ -74,6 +75,41 @@ extern struct ip_conntrack_protocol ip_c - extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; - extern int ip_conntrack_protocol_tcp_init(void); - -+#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) -+#include <linux/sched.h> -+#define ve_ip_ct_tcp_timeouts \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeouts) -+#define ve_ip_ct_udp_timeout \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout) -+#define ve_ip_ct_udp_timeout_stream \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout_stream) -+#define ve_ip_ct_icmp_timeout \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_icmp_timeout) -+#define ve_ip_ct_generic_timeout \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_generic_timeout) -+#define ve_ip_ct_log_invalid \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_log_invalid) -+#define ve_ip_ct_tcp_timeout_max_retrans \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeout_max_retrans) -+#define ve_ip_ct_tcp_loose \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_loose) -+#define ve_ip_ct_tcp_be_liberal \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_be_liberal) -+#define ve_ip_ct_tcp_max_retrans \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_max_retrans) -+#else -+#define ve_ip_ct_tcp_timeouts *tcp_timeouts -+#define ve_ip_ct_udp_timeout ip_ct_udp_timeout -+#define ve_ip_ct_udp_timeout_stream ip_ct_udp_timeout_stream -+#define ve_ip_ct_icmp_timeout ip_ct_icmp_timeout -+#define ve_ip_ct_generic_timeout ip_ct_generic_timeout -+#define ve_ip_ct_log_invalid ip_ct_log_invalid -+#define ve_ip_ct_tcp_timeout_max_retrans 
ip_ct_tcp_timeout_max_retrans -+#define ve_ip_ct_tcp_loose ip_ct_tcp_loose -+#define ve_ip_ct_tcp_be_liberal ip_ct_tcp_be_liberal -+#define ve_ip_ct_tcp_max_retrans ip_ct_tcp_max_retrans -+#endif -+ - /* Log invalid packets */ - extern unsigned int ip_ct_log_invalid; - -@@ -85,10 +121,10 @@ extern int ip_ct_port_nfattr_to_tuple(st - #ifdef CONFIG_SYSCTL - #ifdef DEBUG_INVALID_PACKETS - #define LOG_INVALID(proto) \ -- (ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) -+ (ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) - #else - #define LOG_INVALID(proto) \ -- ((ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) \ -+ ((ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) \ - && net_ratelimit()) - #endif - #else -diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat.h linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_nat.h ---- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_nat.h 2006-04-19 15:02:12.000000000 +0400 -@@ -1,5 +1,6 @@ - #ifndef _IP_NAT_H - #define _IP_NAT_H -+#include <linux/config.h> - #include <linux/netfilter_ipv4.h> - #include <linux/netfilter_ipv4/ip_conntrack_tuple.h> - -@@ -72,10 +73,29 @@ extern unsigned int ip_nat_setup_info(st - extern int ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack); - -+extern void ip_nat_hash_conntrack(struct ip_conntrack *conntrack); -+ - /* Calculate relative checksum. 
*/ - extern u_int16_t ip_nat_cheat_check(u_int32_t oldvalinv, - u_int32_t newval, - u_int16_t oldcheck); -+ -+#ifdef CONFIG_COMPAT -+#include <net/compat.h> -+ -+struct compat_ip_nat_range -+{ -+ compat_uint_t flags; -+ u_int32_t min_ip, max_ip; -+ union ip_conntrack_manip_proto min, max; -+}; -+ -+struct compat_ip_nat_multi_range -+{ -+ compat_uint_t rangesize; -+ struct compat_ip_nat_range range[1]; -+}; -+#endif - #else /* !__KERNEL__: iptables wants this to compile. */ - #define ip_nat_multi_range ip_nat_multi_range_compat - #endif /*__KERNEL__*/ -diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat_rule.h linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_nat_rule.h ---- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-04-19 15:02:12.000000000 +0400 -@@ -6,7 +6,7 @@ - - #ifdef __KERNEL__ - --extern int ip_nat_rule_init(void) __init; -+extern int ip_nat_rule_init(void); - extern void ip_nat_rule_cleanup(void); - extern int ip_nat_rule_find(struct sk_buff **pskb, - unsigned int hooknum, -diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_tables.h linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_tables.h ---- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_tables.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/netfilter_ipv4/ip_tables.h 2006-04-19 15:02:12.000000000 +0400 -@@ -16,6 +16,7 @@ - #define _IPTABLES_H - - #ifdef __KERNEL__ -+#include <linux/config.h> - #include <linux/if.h> - #include <linux/types.h> - #include <linux/in.h> -@@ -330,7 +331,7 @@ extern void ipt_init(void) __init; - //#define ipt_register_table(tbl, repl) xt_register_table(AF_INET, tbl, repl) - //#define ipt_unregister_table(tbl) xt_unregister_table(AF_INET, tbl) - --extern int ipt_register_table(struct ipt_table *table, -+extern struct ipt_table *ipt_register_table(struct ipt_table *table, - const 
struct ipt_replace *repl); - extern void ipt_unregister_table(struct ipt_table *table); - -@@ -364,5 +365,62 @@ extern unsigned int ipt_do_table(struct - void *userdata); - - #define IPT_ALIGN(s) XT_ALIGN(s) -+ -+#ifdef CONFIG_COMPAT -+#include <net/compat.h> -+ -+struct compat_ipt_getinfo -+{ -+ char name[IPT_TABLE_MAXNAMELEN]; -+ compat_uint_t valid_hooks; -+ compat_uint_t hook_entry[NF_IP_NUMHOOKS]; -+ compat_uint_t underflow[NF_IP_NUMHOOKS]; -+ compat_uint_t num_entries; -+ compat_uint_t size; -+}; -+ -+struct compat_ipt_entry -+{ -+ struct ipt_ip ip; -+ compat_uint_t nfcache; -+ u_int16_t target_offset; -+ u_int16_t next_offset; -+ compat_uint_t comefrom; -+ struct compat_xt_counters counters; -+ unsigned char elems[0]; -+}; -+ -+struct compat_ipt_entry_match -+{ -+ union { -+ struct { -+ u_int16_t match_size; -+ char name[IPT_FUNCTION_MAXNAMELEN]; -+ } user; -+ u_int16_t match_size; -+ } u; -+ unsigned char data[0]; -+}; -+ -+struct compat_ipt_entry_target -+{ -+ union { -+ struct { -+ u_int16_t target_size; -+ char name[IPT_FUNCTION_MAXNAMELEN]; -+ } user; -+ u_int16_t target_size; -+ } u; -+ unsigned char data[0]; -+}; -+ -+#define COMPAT_IPT_ALIGN(s) COMPAT_XT_ALIGN(s) -+ -+extern int ipt_match_align_compat(void *match, void **dstptr, -+ int *size, int off, int convert); -+extern int ipt_target_align_compat(void *target, void **dstptr, -+ int *size, int off, int convert); -+ -+#endif /* CONFIG_COMPAT */ - #endif /*__KERNEL__*/ - #endif /* _IPTABLES_H */ -diff -upr linux-2.6.16.orig/include/linux/nfcalls.h linux-2.6.16-026test009/include/linux/nfcalls.h ---- linux-2.6.16.orig/include/linux/nfcalls.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/nfcalls.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,233 @@ -+/* -+ * include/linux/nfcalls.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef _LINUX_NFCALLS_H -+#define _LINUX_NFCALLS_H -+ -+#include <linux/rcupdate.h> -+ -+#ifdef CONFIG_MODULES -+extern struct module no_module; -+ -+#define DECL_KSYM_MODULE(name) \ -+ extern struct module *vz_mod_##name -+#define DECL_KSYM_CALL(type, name, args) \ -+ extern type (*vz_##name) args -+ -+#define INIT_KSYM_MODULE(name) \ -+ struct module *vz_mod_##name = &no_module; \ -+ EXPORT_SYMBOL(vz_mod_##name) -+#define INIT_KSYM_CALL(type, name, args) \ -+ type (*vz_##name) args; \ -+ EXPORT_SYMBOL(vz_##name) -+ -+#define __KSYMERRCALL(err, type, mod, name, args) \ -+({ \ -+ type ret = (type)err; \ -+ if (!__vzksym_module_get(vz_mod_##mod)) { \ -+ if (vz_##name) \ -+ ret = ((*vz_##name)args); \ -+ __vzksym_module_put(vz_mod_##mod); \ -+ } \ -+ ret; \ -+}) -+#define __KSYMSAFECALL_VOID(mod, name, args) \ -+do { \ -+ if (!__vzksym_module_get(vz_mod_##mod)) { \ -+ if (vz_##name) \ -+ ((*vz_##name)args); \ -+ __vzksym_module_put(vz_mod_##mod); \ -+ } \ -+} while (0) -+#else -+#define DECL_KSYM_CALL(type, name, args) \ -+ extern type name args -+#define INIT_KSYM_MODULE(name) -+#define INIT_KSYM_CALL(type, name, args) \ -+ type name args -+#define __KSYMERRCALL(err, type, mod, name, args) ((*name)args) -+#define __KSYMSAFECALL_VOID(mod, name, args) ((*name)args) -+#endif -+ -+#define KSYMERRCALL(err, mod, name, args) \ -+ __KSYMERRCALL(err, int, mod, name, args) -+#define KSYMSAFECALL(type, mod, name, args) \ -+ __KSYMERRCALL(0, type, mod, name, args) -+#define KSYMSAFECALL_VOID(mod, name, args) \ -+ __KSYMSAFECALL_VOID(mod, name, args) -+ -+#if defined(CONFIG_VE) && defined(CONFIG_MODULES) -+/* should be called _after_ KSYMRESOLVE's */ -+#define KSYMMODRESOLVE(name) \ -+ __vzksym_modresolve(&vz_mod_##name, THIS_MODULE) -+#define KSYMMODUNRESOLVE(name) \ -+ __vzksym_modunresolve(&vz_mod_##name) -+ -+#define KSYMRESOLVE(name) \ -+ vz_##name = &name -+#define KSYMUNRESOLVE(name) \ -+ vz_##name = NULL -+#else -+#define KSYMRESOLVE(name) do { } while 
(0) -+#define KSYMUNRESOLVE(name) do { } while (0) -+#define KSYMMODRESOLVE(name) do { } while (0) -+#define KSYMMODUNRESOLVE(name) do { } while (0) -+#endif -+ -+#ifdef CONFIG_MODULES -+static inline void __vzksym_modresolve(struct module **modp, struct module *mod) -+{ -+ /* -+ * we want to be sure, that pointer updates are visible first: -+ * 1. wmb() is here only for piece of sure -+ * (note, no rmb() in KSYMSAFECALL) -+ * 2. synchronize_sched() guarantees that updates are visible -+ * on all cpus and allows us to remove rmb() in KSYMSAFECALL -+ */ -+ wmb(); synchronize_sched(); -+ *modp = mod; -+ /* just to be sure, our changes are visible as soon as possible */ -+ wmb(); synchronize_sched(); -+} -+ -+static inline void __vzksym_modunresolve(struct module **modp) -+{ -+ /* -+ * try_module_get() in KSYMSAFECALL should fail at this moment since -+ * THIS_MODULE in in unloading state (we should be called from fini), -+ * no need to syncronize pointers/ve_module updates. -+ */ -+ *modp = &no_module; -+ /* -+ * synchronize_sched() guarantees here that we see -+ * updated module pointer before the module really gets away -+ */ -+ synchronize_sched(); -+} -+ -+static inline int __vzksym_module_get(struct module *mod) -+{ -+ /* -+ * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE -+ * and smp_read_barrier_depends() here... 
-+ */ -+ smp_read_barrier_depends(); /* for module loading */ -+ if (!try_module_get(mod)) -+ return -EBUSY; -+ -+ return 0; -+} -+ -+static inline void __vzksym_module_put(struct module *mod) -+{ -+ module_put(mod); -+} -+#endif -+ -+#if defined(CONFIG_VE_IPTABLES) -+#ifdef CONFIG_MODULES -+DECL_KSYM_MODULE(x_tables); -+DECL_KSYM_MODULE(xt_tcpudp); -+DECL_KSYM_MODULE(ip_tables); -+DECL_KSYM_MODULE(iptable_filter); -+DECL_KSYM_MODULE(iptable_mangle); -+DECL_KSYM_MODULE(xt_limit); -+DECL_KSYM_MODULE(ipt_multiport); -+DECL_KSYM_MODULE(ipt_tos); -+DECL_KSYM_MODULE(ipt_TOS); -+DECL_KSYM_MODULE(ipt_REJECT); -+DECL_KSYM_MODULE(ipt_TCPMSS); -+DECL_KSYM_MODULE(xt_tcpmss); -+DECL_KSYM_MODULE(ipt_ttl); -+DECL_KSYM_MODULE(ipt_LOG); -+DECL_KSYM_MODULE(xt_length); -+DECL_KSYM_MODULE(ip_conntrack); -+DECL_KSYM_MODULE(ip_conntrack_ftp); -+DECL_KSYM_MODULE(ip_conntrack_irc); -+DECL_KSYM_MODULE(xt_conntrack); -+DECL_KSYM_MODULE(xt_state); -+DECL_KSYM_MODULE(xt_helper); -+DECL_KSYM_MODULE(ip_nat); -+DECL_KSYM_MODULE(iptable_nat); -+DECL_KSYM_MODULE(ip_nat_ftp); -+DECL_KSYM_MODULE(ip_nat_irc); -+DECL_KSYM_MODULE(ipt_REDIRECT); -+#endif -+ -+struct sk_buff; -+ -+DECL_KSYM_CALL(int, init_netfilter, (void)); -+DECL_KSYM_CALL(int, init_xtables, (void)); -+DECL_KSYM_CALL(int, init_xt_tcpudp, (void)); -+DECL_KSYM_CALL(int, init_iptables, (void)); -+DECL_KSYM_CALL(int, init_iptable_filter, (void)); -+DECL_KSYM_CALL(int, init_iptable_mangle, (void)); -+DECL_KSYM_CALL(int, init_xt_limit, (void)); -+DECL_KSYM_CALL(int, init_iptable_multiport, (void)); -+DECL_KSYM_CALL(int, init_iptable_tos, (void)); -+DECL_KSYM_CALL(int, init_iptable_TOS, (void)); -+DECL_KSYM_CALL(int, init_iptable_REJECT, (void)); -+DECL_KSYM_CALL(int, init_iptable_TCPMSS, (void)); -+DECL_KSYM_CALL(int, init_xt_tcpmss, (void)); -+DECL_KSYM_CALL(int, init_iptable_ttl, (void)); -+DECL_KSYM_CALL(int, init_iptable_LOG, (void)); -+DECL_KSYM_CALL(int, init_xt_length, (void)); -+DECL_KSYM_CALL(int, init_iptable_conntrack, (void)); 
-+DECL_KSYM_CALL(int, init_iptable_ftp, (void)); -+DECL_KSYM_CALL(int, init_iptable_irc, (void)); -+DECL_KSYM_CALL(int, init_xt_conntrack_match, (void)); -+DECL_KSYM_CALL(int, init_xt_state, (void)); -+DECL_KSYM_CALL(int, init_xt_helper, (void)); -+DECL_KSYM_CALL(int, ip_nat_init, (void)); -+DECL_KSYM_CALL(int, init_iptable_nat, (void)); -+DECL_KSYM_CALL(int, init_iptable_nat_ftp, (void)); -+DECL_KSYM_CALL(int, init_iptable_nat_irc, (void)); -+DECL_KSYM_CALL(int, init_iptable_REDIRECT, (void)); -+DECL_KSYM_CALL(void, fini_iptable_nat_irc, (void)); -+DECL_KSYM_CALL(void, fini_iptable_nat_ftp, (void)); -+DECL_KSYM_CALL(void, fini_iptable_nat, (void)); -+DECL_KSYM_CALL(void, ip_nat_cleanup, (void)); -+DECL_KSYM_CALL(void, fini_xt_helper, (void)); -+DECL_KSYM_CALL(void, fini_xt_state, (void)); -+DECL_KSYM_CALL(void, fini_xt_conntrack_match, (void)); -+DECL_KSYM_CALL(void, fini_iptable_irc, (void)); -+DECL_KSYM_CALL(void, fini_iptable_ftp, (void)); -+DECL_KSYM_CALL(void, fini_iptable_conntrack, (void)); -+DECL_KSYM_CALL(void, fini_xt_length, (void)); -+DECL_KSYM_CALL(void, fini_iptable_LOG, (void)); -+DECL_KSYM_CALL(void, fini_iptable_ttl, (void)); -+DECL_KSYM_CALL(void, fini_xt_tcpmss, (void)); -+DECL_KSYM_CALL(void, fini_iptable_TCPMSS, (void)); -+DECL_KSYM_CALL(void, fini_iptable_REJECT, (void)); -+DECL_KSYM_CALL(void, fini_iptable_TOS, (void)); -+DECL_KSYM_CALL(void, fini_iptable_tos, (void)); -+DECL_KSYM_CALL(void, fini_iptable_multiport, (void)); -+DECL_KSYM_CALL(void, fini_xt_limit, (void)); -+DECL_KSYM_CALL(void, fini_iptable_filter, (void)); -+DECL_KSYM_CALL(void, fini_iptable_mangle, (void)); -+DECL_KSYM_CALL(void, fini_iptables, (void)); -+DECL_KSYM_CALL(void, fini_xt_tcpudp, (void)); -+DECL_KSYM_CALL(void, fini_xtables, (void)); -+DECL_KSYM_CALL(void, fini_netfilter, (void)); -+DECL_KSYM_CALL(void, fini_iptable_REDIRECT, (void)); -+ -+DECL_KSYM_CALL(void, ipt_flush_table, (struct xt_table *table)); -+#endif /* CONFIG_VE_IPTABLES */ -+ -+#ifdef 
CONFIG_VE_CALLS_MODULE -+DECL_KSYM_MODULE(vzmon); -+DECL_KSYM_CALL(int, real_get_device_perms_ve, -+ (int dev_type, dev_t dev, int access_mode)); -+DECL_KSYM_CALL(void, real_do_env_cleanup, (struct ve_struct *env)); -+DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); -+DECL_KSYM_CALL(void, real_update_load_avg_ve, (void)); -+#endif -+ -+#endif /* _LINUX_NFCALLS_H */ -diff -upr linux-2.6.16.orig/include/linux/nfs_fs.h linux-2.6.16-026test009/include/linux/nfs_fs.h ---- linux-2.6.16.orig/include/linux/nfs_fs.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/nfs_fs.h 2006-04-19 15:02:11.000000000 +0400 -@@ -296,7 +296,7 @@ extern struct inode *nfs_fhget(struct su - extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); - extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); - extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); --extern int nfs_permission(struct inode *, int, struct nameidata *); -+extern int nfs_permission(struct inode *, int, struct nameidata *, struct exec_perm *); - extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *); - extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); - extern int nfs_open(struct inode *, struct file *); -diff -upr linux-2.6.16.orig/include/linux/notifier.h linux-2.6.16-026test009/include/linux/notifier.h ---- linux-2.6.16.orig/include/linux/notifier.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/notifier.h 2006-04-19 15:02:12.000000000 +0400 -@@ -27,8 +27,9 @@ extern int notifier_call_chain(struct no - - #define NOTIFY_DONE 0x0000 /* Don't care */ - #define NOTIFY_OK 0x0001 /* Suits me */ -+#define NOTIFY_FAIL 0x0002 /* Reject */ - #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ --#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) /* Bad/Veto action */ -+#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) /* Bad/Veto 
action */ - /* - * Clean way to return from the notifier and stop further calls. - */ -diff -upr linux-2.6.16.orig/include/linux/page-flags.h linux-2.6.16-026test009/include/linux/page-flags.h ---- linux-2.6.16.orig/include/linux/page-flags.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/page-flags.h 2006-04-19 15:02:11.000000000 +0400 -@@ -74,7 +74,9 @@ - #define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ - #define PG_reclaim 17 /* To be reclaimed asap */ - #define PG_nosave_free 18 /* Free, should not be written */ --#define PG_uncached 19 /* Page has been mapped as uncached */ -+#define PG_buddy 19 /* Page is free, on buddy lists */ -+ -+#define PG_uncached 20 /* Page has been mapped as uncached */ - - /* - * Global page accounting. One instance per CPU. Only unsigned longs are -@@ -319,6 +321,10 @@ extern void __mod_page_state_offset(unsi - #define SetPageNosaveFree(page) set_bit(PG_nosave_free, &(page)->flags) - #define ClearPageNosaveFree(page) clear_bit(PG_nosave_free, &(page)->flags) - -+#define PageBuddy(page) test_bit(PG_buddy, &(page)->flags) -+#define __SetPageBuddy(page) __set_bit(PG_buddy, &(page)->flags) -+#define __ClearPageBuddy(page) __clear_bit(PG_buddy, &(page)->flags) -+ - #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) - #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) - #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) -diff -upr linux-2.6.16.orig/include/linux/pid.h linux-2.6.16-026test009/include/linux/pid.h ---- linux-2.6.16.orig/include/linux/pid.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/pid.h 2006-04-19 15:02:12.000000000 +0400 -@@ -1,6 +1,18 @@ - #ifndef _LINUX_PID_H - #define _LINUX_PID_H - -+#define VPID_BIT 10 -+#define VPID_DIV (1<<VPID_BIT) -+ -+#ifdef CONFIG_VE -+#define __is_virtual_pid(pid) ((pid) & VPID_DIV) -+#define is_virtual_pid(pid) \ -+ (__is_virtual_pid(pid) || 
((pid)==1 && !ve_is_super(get_exec_env()))) -+#else -+#define __is_virtual_pid(pid) 0 -+#define is_virtual_pid(pid) 0 -+#endif -+ - enum pid_type - { - PIDTYPE_PID, -@@ -15,6 +27,9 @@ struct pid - /* Try to keep pid_chain in the same cacheline as nr for find_pid */ - int nr; - struct hlist_node pid_chain; -+#ifdef CONFIG_VE -+ int vnr; -+#endif - /* list of pids with the same nr, only one of them is in the hash */ - struct list_head pid_list; - }; -@@ -40,16 +55,89 @@ extern int alloc_pidmap(void); - extern void FASTCALL(free_pidmap(int)); - extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread); - --#define do_each_task_pid(who, type, task) \ -- if ((task = find_task_by_pid_type(type, who))) { \ -+#ifndef CONFIG_VE -+ -+#define vpid_to_pid(pid) (pid) -+#define __vpid_to_pid(pid) (pid) -+#define pid_type_to_vpid(type, pid) (pid) -+#define __pid_type_to_vpid(type, pid) (pid) -+ -+#define comb_vpid_to_pid(pid) (pid) -+#define comb_pid_to_vpid(pid) (pid) -+ -+#else -+ -+struct ve_struct; -+extern void free_vpid(int vpid, struct ve_struct *ve); -+extern int alloc_vpid(int pid, int vpid); -+extern int vpid_to_pid(int pid); -+extern int __vpid_to_pid(int pid); -+extern pid_t pid_type_to_vpid(int type, pid_t pid); -+extern pid_t _pid_type_to_vpid(int type, pid_t pid); -+ -+static inline int comb_vpid_to_pid(int vpid) -+{ -+ int pid = vpid; -+ -+ if (vpid > 0) { -+ pid = vpid_to_pid(vpid); -+ if (unlikely(pid < 0)) -+ return 0; -+ } else if (vpid < 0) { -+ pid = vpid_to_pid(-vpid); -+ if (unlikely(pid < 0)) -+ return 0; -+ pid = -pid; -+ } -+ return pid; -+} -+ -+static inline int comb_pid_to_vpid(int pid) -+{ -+ int vpid = pid; -+ -+ if (pid > 0) { -+ vpid = pid_type_to_vpid(PIDTYPE_PID, pid); -+ if (unlikely(vpid < 0)) -+ return 0; -+ } else if (pid < 0) { -+ vpid = pid_type_to_vpid(PIDTYPE_PGID, -pid); -+ if (unlikely(vpid < 0)) -+ return 0; -+ vpid = -vpid; -+ } -+ return vpid; -+} -+#endif -+ -+#define do_each_task_pid_all(who, type, 
task) \ -+ if ((task = find_task_by_pid_type_all(type, who))) { \ - prefetch((task)->pids[type].pid_list.next); \ - do { - --#define while_each_task_pid(who, type, task) \ -+#define while_each_task_pid_all(who, type, task) \ - } while (task = pid_task((task)->pids[type].pid_list.next,\ - type), \ - prefetch((task)->pids[type].pid_list.next), \ - hlist_unhashed(&(task)->pids[type].pid_chain)); \ - } \ - -+#ifndef CONFIG_VE -+#define __do_each_task_pid_ve(who, type, task, owner) \ -+ do_each_task_pid_all(who, type, task) -+#define __while_each_task_pid_ve(who, type, task, owner) \ -+ while_each_task_pid_all(who, type, task) -+#else /* CONFIG_VE */ -+#define __do_each_task_pid_ve(who, type, task, owner) \ -+ do_each_task_pid_all(who, type, task) \ -+ if (ve_accessible(VE_TASK_INFO(task)->owner_env, owner)) -+#define __while_each_task_pid_ve(who, type, task, owner) \ -+ while_each_task_pid_all(who, type, task) -+#endif /* CONFIG_VE */ -+ -+#define do_each_task_pid_ve(who, type, task) \ -+ __do_each_task_pid_ve(who, type, task, get_exec_env()); -+#define while_each_task_pid_ve(who, type, task) \ -+ __while_each_task_pid_ve(who, type, task, get_exec_env()); -+ - #endif /* _LINUX_PID_H */ -diff -upr linux-2.6.16.orig/include/linux/proc_fs.h linux-2.6.16-026test009/include/linux/proc_fs.h ---- linux-2.6.16.orig/include/linux/proc_fs.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/proc_fs.h 2006-04-19 15:02:12.000000000 +0400 -@@ -78,7 +78,7 @@ struct kcore_list { - struct vmcore { - struct list_head list; - unsigned long long paddr; -- unsigned long size; -+ unsigned long long size; - loff_t offset; - }; - -@@ -86,8 +86,14 @@ struct vmcore { - - extern struct proc_dir_entry proc_root; - extern struct proc_dir_entry *proc_root_fs; -+#ifdef CONFIG_VE -+#include <linux/sched.h> -+#define proc_net (get_exec_env()->_proc_net) -+#define proc_net_stat (get_exec_env()->_proc_net_stat) -+#else - extern struct proc_dir_entry *proc_net; - extern 
struct proc_dir_entry *proc_net_stat; -+#endif - extern struct proc_dir_entry *proc_bus; - extern struct proc_dir_entry *proc_root_driver; - extern struct proc_dir_entry *proc_root_kcore; -@@ -98,8 +104,8 @@ extern void proc_misc_init(void); - struct mm_struct; - - struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); --struct dentry *proc_pid_unhash(struct task_struct *p); --void proc_pid_flush(struct dentry *proc_dentry); -+void proc_pid_unhash(struct task_struct *p, struct dentry * [2]); -+void proc_pid_flush(struct dentry *proc_dentry[2]); - int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); - unsigned long task_vsize(struct mm_struct *); - int task_statm(struct mm_struct *, int *, int *, int *, int *); -@@ -107,7 +113,11 @@ char *task_mem(struct mm_struct *, char - - extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, - struct proc_dir_entry *parent); -+extern struct proc_dir_entry *create_proc_glob_entry(const char *name, -+ mode_t mode, -+ struct proc_dir_entry *parent); - extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); -+extern void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent); - - extern struct vfsmount *proc_mnt; - extern int proc_fill_super(struct super_block *,void *,int); -@@ -194,6 +204,15 @@ static inline struct proc_dir_entry *pro - return res; - } - -+static inline struct proc_dir_entry *proc_glob_fops_create(const char *name, -+ mode_t mode, struct file_operations *fops) -+{ -+ struct proc_dir_entry *res = create_proc_glob_entry(name, mode, NULL); -+ if (res) -+ res->proc_fops = fops; -+ return res; -+} -+ - static inline void proc_net_remove(const char *name) - { - remove_proc_entry(name,proc_net); -@@ -206,16 +225,21 @@ static inline void proc_net_remove(const - #define proc_bus NULL - - #define proc_net_fops_create(name, mode, fops) ({ (void)(mode), NULL; }) -+#define proc_glob_fops_create(name, 
mode, fops) ({ (void)(mode), NULL; }) - #define proc_net_create(name, mode, info) ({ (void)(mode), NULL; }) - static inline void proc_net_remove(const char *name) {} - --static inline struct dentry *proc_pid_unhash(struct task_struct *p) { return NULL; } --static inline void proc_pid_flush(struct dentry *proc_dentry) { } -+static inline struct dentry *proc_pid_unhash(struct task_struct *p, -+ struct dentry *d[2]) { return NULL; } -+static inline void proc_pid_flush(struct dentry *proc_dentry[2]) { } - - static inline struct proc_dir_entry *create_proc_entry(const char *name, - mode_t mode, struct proc_dir_entry *parent) { return NULL; } -+static inline struct proc_dir_entry *create_proc_glob_entry(const char *name, -+ mode_t mode, struct proc_dir_entry *parent) { return NULL; } - - #define remove_proc_entry(name, parent) do {} while (0) -+#define remove_proc_glob_entry(name, parent) do {} while (0) - - static inline struct proc_dir_entry *proc_symlink(const char *name, - struct proc_dir_entry *parent,const char *dest) {return NULL;} -@@ -266,4 +290,18 @@ static inline struct proc_dir_entry *PDE - return PROC_I(inode)->pde; - } - -+static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) -+{ -+ if (de) -+ atomic_inc(&de->count); -+ return de; -+} -+ -+extern void de_put(struct proc_dir_entry *); -+ -+#define LPDE(inode) (PROC_I((inode))->pde) -+#ifdef CONFIG_VE -+#define GPDE(inode) (*(struct proc_dir_entry **)(&(inode)->i_pipe)) -+#endif -+ - #endif /* _LINUX_PROC_FS_H */ -diff -upr linux-2.6.16.orig/include/linux/quota.h linux-2.6.16-026test009/include/linux/quota.h ---- linux-2.6.16.orig/include/linux/quota.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/quota.h 2006-04-19 15:02:12.000000000 +0400 -@@ -37,7 +37,6 @@ - - #include <linux/errno.h> - #include <linux/types.h> --#include <linux/spinlock.h> - - #define __DQUOT_VERSION__ "dquot_6.5.1" - #define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 -@@ -45,8 +44,6 @@ - 
typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ - typedef __u64 qsize_t; /* Type in which we store sizes */ - --extern spinlock_t dq_data_lock; -- - /* Size of blocks in which are counted size limits */ - #define QUOTABLOCK_BITS 10 - #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) -@@ -133,6 +130,10 @@ struct if_dqinfo { - - #ifdef __KERNEL__ - -+#include <linux/spinlock.h> -+ -+extern spinlock_t dq_data_lock; -+ - #include <linux/dqblk_xfs.h> - #include <linux/dqblk_v1.h> - #include <linux/dqblk_v2.h> -@@ -242,6 +243,8 @@ struct quota_format_ops { - int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ - }; - -+struct inode; -+struct iattr; - /* Operations working with dquots */ - struct dquot_operations { - int (*initialize) (struct inode *, int); -@@ -256,9 +259,11 @@ struct dquot_operations { - int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ - int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ - int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ -+ int (*rename) (struct inode *, struct inode *, struct inode *); - }; - - /* Operations handling requests from userspace */ -+struct v2_disk_dqblk; - struct quotactl_ops { - int (*quota_on)(struct super_block *, int, int, char *); - int (*quota_off)(struct super_block *, int); -@@ -271,6 +276,9 @@ struct quotactl_ops { - int (*set_xstate)(struct super_block *, unsigned int, int); - int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); - int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); -+#ifdef CONFIG_QUOTA_COMPAT -+ int (*get_quoti)(struct super_block *, int, unsigned int, struct v2_disk_dqblk *); -+#endif - }; - - struct quota_format_type { -@@ -291,6 +299,10 @@ struct quota_info { - struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ - struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ - 
struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ -+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) -+ struct vz_quota_master *vzdq_master; -+ int vzdq_count; -+#endif - }; - - /* Inline would be better but we need to dereference super_block which is not defined yet */ -diff -upr linux-2.6.16.orig/include/linux/quotaops.h linux-2.6.16-026test009/include/linux/quotaops.h ---- linux-2.6.16.orig/include/linux/quotaops.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/quotaops.h 2006-04-19 15:02:12.000000000 +0400 -@@ -171,6 +171,19 @@ static __inline__ int DQUOT_TRANSFER(str - return 0; - } - -+static __inline__ int DQUOT_RENAME(struct inode *inode, -+ struct inode *old_dir, struct inode *new_dir) -+{ -+ struct dquot_operations *q_op; -+ -+ q_op = inode->i_sb->dq_op; -+ if (q_op && q_op->rename) { -+ if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA) -+ return 1; -+ } -+ return 0; -+} -+ - /* The following two functions cannot be called inside a transaction */ - #define DQUOT_SYNC(sb) sync_dquots(sb, -1) - -@@ -197,6 +210,7 @@ static __inline__ int DQUOT_OFF(struct s - #define DQUOT_SYNC(sb) do { } while(0) - #define DQUOT_OFF(sb) do { } while(0) - #define DQUOT_TRANSFER(inode, iattr) (0) -+#define DQUOT_RENAME(inode, old_dir, new_dir) (0) - static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) - { - inode_add_bytes(inode, nr); -diff -upr linux-2.6.16.orig/include/linux/raid/raid1.h linux-2.6.16-026test009/include/linux/raid/raid1.h ---- linux-2.6.16.orig/include/linux/raid/raid1.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/raid/raid1.h 2006-04-19 15:02:11.000000000 +0400 -@@ -130,6 +130,6 @@ struct r1bio_s { - * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... 
- */ --#define R1BIO_Returned 4 -+#define R1BIO_Returned 6 - - #endif -diff -upr linux-2.6.16.orig/include/linux/reiserfs_xattr.h linux-2.6.16-026test009/include/linux/reiserfs_xattr.h ---- linux-2.6.16.orig/include/linux/reiserfs_xattr.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/reiserfs_xattr.h 2006-04-19 15:02:11.000000000 +0400 -@@ -42,7 +42,8 @@ int reiserfs_removexattr(struct dentry * - int reiserfs_delete_xattrs(struct inode *inode); - int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); - int reiserfs_xattr_init(struct super_block *sb, int mount_flags); --int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd); -+int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, -+ struct exec_perm *); - - int reiserfs_xattr_del(struct inode *, const char *); - int reiserfs_xattr_get(const struct inode *, const char *, void *, size_t); -diff -upr linux-2.6.16.orig/include/linux/rmap.h linux-2.6.16-026test009/include/linux/rmap.h ---- linux-2.6.16.orig/include/linux/rmap.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/rmap.h 2006-04-19 15:02:12.000000000 +0400 -@@ -74,6 +74,7 @@ void page_add_anon_rmap(struct page *, s - void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); - void page_add_file_rmap(struct page *); - void page_remove_rmap(struct page *); -+struct anon_vma *page_lock_anon_vma(struct page *page); - - /** - * page_dup_rmap - duplicate pte mapping to a page -diff -upr linux-2.6.16.orig/include/linux/rtc.h linux-2.6.16-026test009/include/linux/rtc.h ---- linux-2.6.16.orig/include/linux/rtc.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/rtc.h 2006-04-19 15:02:11.000000000 +0400 -@@ -11,8 +11,6 @@ - #ifndef _LINUX_RTC_H_ - #define _LINUX_RTC_H_ - --#include <linux/interrupt.h> -- - /* - * The struct used to pass data via the following ioctl. 
Similar to the - * struct tm in <time.h>, but it needs to be here so that the kernel -@@ -95,6 +93,8 @@ struct rtc_pll_info { - - #ifdef __KERNEL__ - -+#include <linux/interrupt.h> -+ - typedef struct rtc_task { - void (*func)(void *private_data); - void *private_data; -diff -upr linux-2.6.16.orig/include/linux/sched.h linux-2.6.16-026test009/include/linux/sched.h ---- linux-2.6.16.orig/include/linux/sched.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/sched.h 2006-04-19 15:02:12.000000000 +0400 -@@ -38,7 +38,10 @@ - - #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ - -+#include <ub/ub_task.h> -+ - struct exec_domain; -+struct ve_struct; - - /* - * cloning flags: -@@ -92,15 +95,34 @@ extern unsigned long avenrun[]; /* Load - load += n*(FIXED_1-exp); \ - load >>= FSHIFT; - -+#define LOAD_INT(x) ((x) >> FSHIFT) -+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) -+ - extern unsigned long total_forks; - extern int nr_threads; - extern int last_pid; - DECLARE_PER_CPU(unsigned long, process_counts); - extern int nr_processes(void); -+ -+extern unsigned long nr_sleeping(void); -+extern unsigned long nr_stopped(void); -+extern unsigned long nr_zombie; -+extern atomic_t nr_dead; - extern unsigned long nr_running(void); - extern unsigned long nr_uninterruptible(void); - extern unsigned long nr_iowait(void); - -+#ifdef CONFIG_VE -+struct ve_struct; -+extern unsigned long nr_running_ve(struct ve_struct *); -+extern unsigned long nr_iowait_ve(struct ve_struct *); -+extern unsigned long nr_uninterruptible_ve(struct ve_struct *); -+#else -+#define nr_running_ve(ve) 0 -+#define nr_iowait_ve(ve) 0 -+#define nr_uninterruptible_ve(ve) 0 -+#endif -+ - #include <linux/time.h> - #include <linux/param.h> - #include <linux/resource.h> -@@ -189,6 +211,7 @@ extern cpumask_t nohz_cpu_mask; - - extern void show_state(void); - extern void show_regs(struct pt_regs *); -+extern void smp_show_regs(struct pt_regs *, void *); - - /* - * TASK is a pointer 
to the task whose backtrace we want to see (or NULL for current -@@ -252,31 +275,7 @@ arch_get_unmapped_area_topdown(struct fi - extern void arch_unmap_area(struct mm_struct *, unsigned long); - extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); - --#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS --/* -- * The mm counters are not protected by its page_table_lock, -- * so must be incremented atomically. -- */ --#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) --#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) --#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) --#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) --#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) --typedef atomic_long_t mm_counter_t; -- --#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ --/* -- * The mm counters are protected by its page_table_lock, -- * so can be incremented directly. 
-- */ --#define set_mm_counter(mm, member, value) (mm)->_##member = (value) --#define get_mm_counter(mm, member) ((mm)->_##member) --#define add_mm_counter(mm, member, value) (mm)->_##member += (value) --#define inc_mm_counter(mm, member) (mm)->_##member++ --#define dec_mm_counter(mm, member) (mm)->_##member-- --typedef unsigned long mm_counter_t; -- --#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ -+#include <linux/mm_counter.h> - - #define get_mm_rss(mm) \ - (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) -@@ -332,6 +331,7 @@ struct mm_struct { - unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - - unsigned dumpable:2; -+ unsigned vps_dumpable:1; - cpumask_t cpu_vm_mask; - - /* Architecture-specific MM context */ -@@ -348,6 +348,9 @@ struct mm_struct { - /* aio bits */ - rwlock_t ioctx_list_lock; - struct kioctx *ioctx_list; -+#ifdef CONFIG_USER_RESOURCE -+ struct user_beancounter *mm_ub; -+#endif - }; - - struct sighand_struct { -@@ -364,6 +367,9 @@ static inline void sighand_free(struct s - call_rcu(&sp->rcu, sighand_free_cb); - } - -+#include <linux/ve.h> -+#include <linux/ve_task.h> -+ - /* - * NOTE! "signal_struct" does not have it's own - * locking, because a shared signal_struct always -@@ -846,6 +852,11 @@ struct task_struct { - - unsigned long ptrace_message; - siginfo_t *last_siginfo; /* For ptrace use. 
*/ -+ -+/* state tracking for suspend */ -+ __u8 pn_state; -+ __u8 stopped_state:1; -+ - /* - * current io wait handle: wait queue entry to use for io waits - * If this thread is processing aio, this points at the waitqueue -@@ -871,6 +882,16 @@ struct task_struct { - #endif - atomic_t fs_excl; /* holding fs exclusive resources */ - struct rcu_head rcu; -+#ifdef CONFIG_USER_RESOURCE -+ struct task_beancounter task_bc; -+#endif -+#ifdef CONFIG_VE -+ struct ve_task_info ve_task_info; -+#endif -+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) -+ unsigned long magic; -+ struct inode *ino; -+#endif - }; - - static inline pid_t process_group(struct task_struct *tsk) -@@ -929,6 +950,43 @@ static inline void put_task_struct(struc - #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ - #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ - -+#ifndef CONFIG_VE -+#define set_pn_state(tsk, state) do { } while(0) -+#define clear_pn_state(tsk) do { } while(0) -+#define set_stop_state(tsk) do { } while(0) -+#define clear_stop_state(tsk) do { } while(0) -+#else -+#define PN_STOP_TF 1 /* was not in 2.6.8 */ -+#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */ -+#define PN_STOP_ENTRY 3 -+#define PN_STOP_FORK 4 -+#define PN_STOP_VFORK 5 -+#define PN_STOP_SIGNAL 6 -+#define PN_STOP_EXIT 7 -+#define PN_STOP_EXEC 8 -+#define PN_STOP_LEAVE 9 -+ -+static inline void set_pn_state(struct task_struct *tsk, int state) -+{ -+ tsk->pn_state = state; -+} -+ -+static inline void clear_pn_state(struct task_struct *tsk) -+{ -+ tsk->pn_state = 0; -+} -+ -+static inline void set_stop_state(struct task_struct *tsk) -+{ -+ tsk->stopped_state = 1; -+} -+ -+static inline void clear_stop_state(struct task_struct *tsk) -+{ -+ tsk->stopped_state = 0; -+} -+#endif -+ - /* - * Only the _current_ task can read/write to tsk->flags, but other - * tasks can access tsk->flags in readonly mode for example -@@ -968,6 +1026,21 @@ static inline int set_cpus_allowed(task_ - 
extern unsigned long long sched_clock(void); - extern unsigned long long current_sched_time(const task_t *current_task); - -+static inline unsigned long cycles_to_clocks(cycles_t cycles) -+{ -+ extern unsigned long cycles_per_clock; -+ do_div(cycles, cycles_per_clock); -+ return cycles; -+} -+ -+static inline u64 cycles_to_jiffies(cycles_t cycles) -+{ -+ extern unsigned long cycles_per_jiffy; -+ do_div(cycles, cycles_per_jiffy); -+ return cycles; -+} -+ -+ - /* sched_exec is called by processes performing an exec */ - #ifdef CONFIG_SMP - extern void sched_exec(void); -@@ -1020,12 +1093,227 @@ extern struct task_struct init_task; - - extern struct mm_struct init_mm; - --#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) --extern struct task_struct *find_task_by_pid_type(int type, int pid); -+#define find_task_by_pid_all(nr) \ -+ find_task_by_pid_type_all(PIDTYPE_PID, nr) -+extern struct task_struct *find_task_by_pid_type_all(int type, int pid); - extern void set_special_pids(pid_t session, pid_t pgrp); - extern void __set_special_pids(pid_t session, pid_t pgrp); - -+#ifndef CONFIG_VE -+#define find_task_by_pid_ve find_task_by_pid_all -+ -+#define get_exec_env() ((struct ve_struct *)NULL) -+#define set_exec_env(new_env) ((struct ve_struct *)NULL) -+ -+#define ve_is_super(env) 1 -+#define ve_accessible(target, owner) 1 -+#define ve_accessible_strict(target, owner) 1 -+#define ve_accessible_veid(target, owner) 1 -+#define ve_accessible_strict_veid(target, owner) 1 -+ -+#define VEID(envid) 0 -+#define get_ve0() NULL -+ -+static inline pid_t virt_pid(struct task_struct *tsk) -+{ -+ return tsk->pid; -+} -+ -+static inline pid_t virt_tgid(struct task_struct *tsk) -+{ -+ return tsk->tgid; -+} -+ -+static inline pid_t virt_pgid(struct task_struct *tsk) -+{ -+ return tsk->signal->pgrp; -+} -+ -+static inline pid_t virt_sid(struct task_struct *tsk) -+{ -+ return tsk->signal->session; -+} -+ -+#define get_task_pid_ve(tsk, ve) get_task_pid(tsk) -+ -+static 
inline pid_t get_task_pid(struct task_struct *tsk) -+{ -+ return tsk->pid; -+} -+ -+static inline pid_t get_task_tgid(struct task_struct *tsk) -+{ -+ return tsk->tgid; -+} -+ -+static inline pid_t get_task_pgid(struct task_struct *tsk) -+{ -+ return tsk->signal->pgrp; -+} -+ -+static inline pid_t get_task_sid(struct task_struct *tsk) -+{ -+ return tsk->signal->session; -+} -+ -+static inline void set_virt_pid(struct task_struct *tsk, pid_t pid) -+{ -+} -+ -+static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid) -+{ -+} -+ -+static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid) -+{ -+} -+ -+static inline void set_virt_sid(struct task_struct *tsk, pid_t pid) -+{ -+} -+ -+static inline pid_t get_task_ppid(struct task_struct *p) -+{ -+ return pid_alive(p) ? p->group_leader->real_parent->tgid : 0; -+} -+ -+#else /* CONFIG_VE */ -+ -+#include <asm/current.h> -+#include <linux/ve.h> -+ -+extern struct ve_struct ve0; -+ -+#define find_task_by_pid_ve(nr) \ -+ find_task_by_pid_type_ve(PIDTYPE_PID, nr) -+ -+extern struct task_struct *find_task_by_pid_type_ve(int type, int pid); -+ -+#define get_ve0() (&ve0) -+#define VEID(envid) ((envid)->veid) -+ -+#define get_exec_env() (VE_TASK_INFO(current)->exec_env) -+static inline struct ve_struct *set_exec_env(struct ve_struct *new_env) -+{ -+ struct ve_struct *old_env; -+ -+ old_env = VE_TASK_INFO(current)->exec_env; -+ VE_TASK_INFO(current)->exec_env = new_env; -+ -+ return old_env; -+} -+ -+#define ve_is_super(env) ((env) == get_ve0()) -+#define ve_accessible_strict(target, owner) ((target) == (owner)) -+static inline int ve_accessible(struct ve_struct *target, -+ struct ve_struct *owner) { -+ return ve_is_super(owner) || ve_accessible_strict(target, owner); -+} -+ -+#define ve_accessible_strict_veid(target, owner) ((target) == (owner)) -+static inline int ve_accessible_veid(envid_t target, envid_t owner) -+{ -+ return get_ve0()->veid == owner || -+ ve_accessible_strict_veid(target, owner); -+} -+ 
-+static inline pid_t virt_pid(struct task_struct *tsk) -+{ -+ return tsk->pids[PIDTYPE_PID].vnr; -+} -+ -+static inline pid_t virt_tgid(struct task_struct *tsk) -+{ -+ return tsk->pids[PIDTYPE_TGID].vnr; -+} -+ -+static inline pid_t virt_pgid(struct task_struct *tsk) -+{ -+ return tsk->pids[PIDTYPE_PGID].vnr; -+} -+ -+static inline pid_t virt_sid(struct task_struct *tsk) -+{ -+ return tsk->pids[PIDTYPE_SID].vnr; -+} -+ -+static inline pid_t get_task_pid_ve(struct task_struct *tsk, struct ve_struct *env) -+{ -+ return ve_is_super(env) ? tsk->pid : virt_pid(tsk); -+} -+ -+static inline pid_t get_task_pid(struct task_struct *tsk) -+{ -+ return get_task_pid_ve(tsk, get_exec_env()); -+} -+ -+static inline pid_t get_task_tgid(struct task_struct *tsk) -+{ -+ return ve_is_super(get_exec_env()) ? tsk->tgid : virt_tgid(tsk); -+} -+ -+static inline pid_t get_task_pgid(struct task_struct *tsk) -+{ -+ return ve_is_super(get_exec_env()) ? tsk->signal->pgrp : virt_pgid(tsk); -+} -+ -+static inline pid_t get_task_sid(struct task_struct *tsk) -+{ -+ return ve_is_super(get_exec_env()) ? tsk->signal->session : virt_sid(tsk); -+} -+ -+static inline void set_virt_pid(struct task_struct *tsk, pid_t pid) -+{ -+ tsk->pids[PIDTYPE_PID].vnr = pid; -+} -+ -+static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid) -+{ -+ tsk->pids[PIDTYPE_TGID].vnr = pid; -+} -+ -+static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid) -+{ -+ tsk->pids[PIDTYPE_PGID].vnr = pid; -+} -+ -+static inline void set_virt_sid(struct task_struct *tsk, pid_t pid) -+{ -+ tsk->pids[PIDTYPE_SID].vnr = pid; -+} -+ -+static inline pid_t get_task_ppid(struct task_struct *p) -+{ -+ struct task_struct *parent; -+ struct ve_struct *env; -+ -+ if (!pid_alive(p)) -+ return 0; -+ env = get_exec_env(); -+ if (get_task_pid_ve(p, env) == 1) -+ return 0; -+ parent = p->group_leader->real_parent; -+ return ve_accessible(VE_TASK_INFO(parent)->owner_env, env) ? 
-+ get_task_tgid(parent) : 1; -+} -+ -+void ve_sched_get_cpu_stat(struct ve_struct *envid, cycles_t *idle, -+ cycles_t *strv, unsigned int cpu); -+void ve_sched_attach(struct ve_struct *envid); -+ -+#endif /* CONFIG_VE */ -+ -+ -+#ifdef CONFIG_VE -+extern cycles_t ve_sched_get_idle_time(struct ve_struct *, int); -+extern cycles_t ve_sched_get_iowait_time(struct ve_struct *, int); -+#else -+#define ve_sched_get_idle_time(ve, cpu) 0 -+#define ve_sched_get_iowait_time(ve, cpu) 0 -+#endif -+ - /* per-UID process charging. */ -+extern int set_user(uid_t new_ruid, int dumpclear); - extern struct user_struct * alloc_uid(uid_t); - static inline struct user_struct *get_uid(struct user_struct *u) - { -@@ -1161,6 +1449,13 @@ extern task_t *child_reaper; - - extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); - extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); -+extern long do_fork_pid(unsigned long clone_flags, -+ unsigned long stack_start, -+ struct pt_regs *regs, -+ unsigned long stack_size, -+ int __user *parent_tidptr, -+ int __user *child_tidptr, -+ long pid0); - task_t *fork_idle(int); - - extern void set_task_comm(struct task_struct *tsk, char *from); -@@ -1187,22 +1482,100 @@ extern void wait_task_inactive(task_t * - add_parent(p, (p)->parent); \ - } while (0) - --#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) --#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) -+#define next_task_all(p) list_entry((p)->tasks.next, struct task_struct, tasks) -+#define prev_task_all(p) list_entry((p)->tasks.prev, struct task_struct, tasks) - --#define for_each_process(p) \ -- for (p = &init_task ; (p = next_task(p)) != &init_task ; ) -+#define for_each_process_all(p) \ -+ for (p = &init_task ; (p = next_task_all(p)) != &init_task ; ) - - /* - * Careful: do_each_thread/while_each_thread is a double loop so - * 'break' will not 
work as expected - use goto instead. - */ --#define do_each_thread(g, t) \ -- for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do -+#define do_each_thread_all(g, t) \ -+ for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do - --#define while_each_thread(g, t) \ -+#define while_each_thread_all(g, t) \ - while ((t = next_thread(t)) != g) - -+#ifndef CONFIG_VE -+ -+#define SET_VE_LINKS(p) -+#define REMOVE_VE_LINKS(p) -+#define for_each_process_ve(p) for_each_process_all(p) -+#define do_each_thread_ve(g, t) do_each_thread_all(g, t) -+#define while_each_thread_ve(g, t) while_each_thread_all(g, t) -+#define first_task_ve() next_task_ve(&init_task) -+#define __first_task_ve(owner) next_task_ve(&init_task) -+#define __next_task_ve(owner, p) next_task_ve(p) -+#define next_task_ve(p) \ -+ (next_task_all(p) != &init_task ? next_task_all(p) : NULL) -+ -+#else /* CONFIG_VE */ -+ -+#define SET_VE_LINKS(p) \ -+ do { \ -+ if (thread_group_leader(p)) \ -+ list_add_tail(&VE_TASK_INFO(p)->vetask_list, \ -+ &VE_TASK_INFO(p)->owner_env->vetask_lh); \ -+ } while (0) -+ -+#define REMOVE_VE_LINKS(p) \ -+ do { \ -+ if (thread_group_leader(p)) \ -+ list_del(&VE_TASK_INFO(p)->vetask_list); \ -+ } while(0) -+ -+static inline task_t* __first_task_ve(struct ve_struct *ve) -+{ -+ task_t *tsk; -+ -+ if (unlikely(ve_is_super(ve))) { -+ tsk = next_task_all(&init_task); -+ if (tsk == &init_task) -+ tsk = NULL; -+ } else { -+ /* probably can return ve->init_entry, but it's more clear */ -+ BUG_ON(list_empty(&ve->vetask_lh)); -+ tsk = VE_TASK_LIST_2_TASK(ve->vetask_lh.next); -+ } -+ return tsk; -+} -+ -+static inline task_t* __next_task_ve(struct ve_struct *ve, task_t *tsk) -+{ -+ if (unlikely(ve_is_super(ve))) { -+ tsk = next_task_all(tsk); -+ if (tsk == &init_task) -+ tsk = NULL; -+ } else { -+ struct list_head *tmp; -+ -+ BUG_ON(VE_TASK_INFO(tsk)->owner_env != ve); -+ tmp = VE_TASK_INFO(tsk)->vetask_list.next; -+ if (tmp == &ve->vetask_lh) -+ tsk = NULL; -+ 
else -+ tsk = VE_TASK_LIST_2_TASK(tmp); -+ } -+ return tsk; -+} -+ -+#define first_task_ve() __first_task_ve(get_exec_env()) -+#define next_task_ve(p) __next_task_ve(get_exec_env(), p) -+/* no one uses prev_task_ve(), copy next_task_ve() if needed */ -+ -+#define for_each_process_ve(p) \ -+ for (p = first_task_ve(); p != NULL ; p = next_task_ve(p)) -+ -+#define do_each_thread_ve(g, t) \ -+ for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do -+ -+#define while_each_thread_ve(g, t) \ -+ while ((t = next_thread(t)) != g) -+ -+#endif /* CONFIG_VE */ -+ - extern task_t * FASTCALL(next_thread(const task_t *p)); - - #define thread_group_leader(p) (p->pid == p->tgid) -@@ -1401,7 +1774,7 @@ static inline int frozen(struct task_str - */ - static inline int freezing(struct task_struct *p) - { -- return p->flags & PF_FREEZE; -+ return test_tsk_thread_flag(p, TIF_FREEZE); - } - - /* -@@ -1410,7 +1783,7 @@ static inline int freezing(struct task_s - */ - static inline void freeze(struct task_struct *p) - { -- p->flags |= PF_FREEZE; -+ set_tsk_thread_flag(p, TIF_FREEZE); - } - - /* -@@ -1431,7 +1804,8 @@ static inline int thaw_process(struct ta - */ - static inline void frozen_process(struct task_struct *p) - { -- p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN; -+ clear_tsk_thread_flag(p, TIF_FREEZE); -+ p->flags |= PF_FROZEN; - } - - extern void refrigerator(void); -diff -upr linux-2.6.16.orig/include/linux/sem.h linux-2.6.16-026test009/include/linux/sem.h ---- linux-2.6.16.orig/include/linux/sem.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/sem.h 2006-04-19 15:02:12.000000000 +0400 -@@ -155,6 +155,9 @@ static inline void exit_sem(struct task_ - } - #endif - -+int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg); -+int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg); -+ - #endif /* __KERNEL__ */ - - #endif /* _LINUX_SEM_H */ -diff -upr linux-2.6.16.orig/include/linux/shm.h 
linux-2.6.16-026test009/include/linux/shm.h ---- linux-2.6.16.orig/include/linux/shm.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/shm.h 2006-04-19 15:02:12.000000000 +0400 -@@ -86,6 +86,7 @@ struct shmid_kernel /* private to the ke - pid_t shm_cprid; - pid_t shm_lprid; - struct user_struct *mlock_user; -+ struct ipc_ids *_shm_ids; - }; - - /* shm_mode upper byte flags */ -@@ -104,6 +105,9 @@ static inline long do_shmat(int shmid, c - } - #endif - -+int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg); -+struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg); -+ - #endif /* __KERNEL__ */ - - #endif /* _LINUX_SHM_H_ */ -diff -upr linux-2.6.16.orig/include/linux/shmem_fs.h linux-2.6.16-026test009/include/linux/shmem_fs.h ---- linux-2.6.16.orig/include/linux/shmem_fs.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/shmem_fs.h 2006-04-19 15:02:11.000000000 +0400 -@@ -19,6 +19,9 @@ struct shmem_inode_info { - swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ - struct list_head swaplist; /* chain of maybes on swap */ - struct inode vfs_inode; -+#ifdef CONFIG_USER_RESOURCE -+ struct user_beancounter *shmi_ub; -+#endif - }; - - struct shmem_sb_info { -diff -upr linux-2.6.16.orig/include/linux/signal.h linux-2.6.16-026test009/include/linux/signal.h ---- linux-2.6.16.orig/include/linux/signal.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/signal.h 2006-04-19 15:02:12.000000000 +0400 -@@ -3,6 +3,7 @@ - - #include <linux/list.h> - #include <linux/spinlock.h> -+#include <linux/slab.h> - #include <asm/signal.h> - #include <asm/siginfo.h> - -@@ -41,6 +42,9 @@ struct sigqueue { - int flags; - siginfo_t info; - struct user_struct *user; -+#ifdef CONFIG_USER_RESOURCE -+ struct user_beancounter *sig_ub; -+#endif - }; - - /* flags values. 
*/ -@@ -263,6 +267,8 @@ extern int sigprocmask(int, sigset_t *, - struct pt_regs; - extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); - -+extern kmem_cache_t *sigqueue_cachep; -+ - #endif /* __KERNEL__ */ - - #endif /* _LINUX_SIGNAL_H */ -diff -upr linux-2.6.16.orig/include/linux/skbuff.h linux-2.6.16-026test009/include/linux/skbuff.h ---- linux-2.6.16.orig/include/linux/skbuff.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/skbuff.h 2006-04-19 15:02:12.000000000 +0400 -@@ -19,6 +19,7 @@ - #include <linux/compiler.h> - #include <linux/time.h> - #include <linux/cache.h> -+#include <linux/ve_owner.h> - - #include <asm/atomic.h> - #include <asm/types.h> -@@ -211,6 +212,8 @@ enum { - * @tc_verd: traffic control verdict - */ - -+#include <ub/ub_sk.h> -+ - struct sk_buff { - /* These two members must be first. */ - struct sk_buff *next; -@@ -294,13 +297,18 @@ struct sk_buff { - *data, - *tail, - *end; -+ struct skb_beancounter skb_bc; -+ struct ve_struct *owner_env; - }; - -+DCL_VE_OWNER_PROTO(SKB, struct sk_buff, owner_env) -+ - #ifdef __KERNEL__ - /* - * Handling routines are only of interest to the kernel - */ - #include <linux/slab.h> -+#include <ub/ub_net.h> - - #include <asm/system.h> - -@@ -1007,6 +1015,8 @@ static inline int pskb_trim(struct sk_bu - */ - static inline void skb_orphan(struct sk_buff *skb) - { -+ ub_skb_uncharge(skb); -+ - if (skb->destructor) - skb->destructor(skb); - skb->destructor = NULL; -diff -upr linux-2.6.16.orig/include/linux/slab.h linux-2.6.16-026test009/include/linux/slab.h ---- linux-2.6.16.orig/include/linux/slab.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/slab.h 2006-04-19 15:02:11.000000000 +0400 -@@ -48,6 +48,26 @@ typedef struct kmem_cache kmem_cache_t; - #define SLAB_PANIC 0x00040000UL /* panic if kmem_cache_create() fails */ - #define SLAB_DESTROY_BY_RCU 0x00080000UL /* defer freeing pages to 
RCU */ - -+/* -+ * allocation rules: __GFP_UBC 0 -+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ * cache (SLAB_UBC) charge charge -+ * (usual caches: mm, vma, task_struct, ...) -+ * -+ * cache (SLAB_UBC | SLAB_NO_CHARGE) charge --- -+ * (ub_kmalloc) (kmalloc) -+ * -+ * cache (no UB flags) BUG() --- -+ * (nonub caches, mempools) -+ * -+ * pages charge --- -+ * (ub_vmalloc, (vmalloc, -+ * poll, fdsets, ...) non-ub allocs) -+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ */ -+#define SLAB_UBC 0x20000000UL /* alloc space for ubs ... */ -+#define SLAB_NO_CHARGE 0x40000000UL /* ... but don't charge */ -+ - /* flags passed to a constructor func */ - #define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */ - #define SLAB_CTOR_ATOMIC 0x002UL /* tell constructor it can't sleep */ -diff -upr linux-2.6.16.orig/include/linux/smp.h linux-2.6.16-026test009/include/linux/smp.h ---- linux-2.6.16.orig/include/linux/smp.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/smp.h 2006-04-19 15:02:11.000000000 +0400 -@@ -10,6 +10,9 @@ - - extern void cpu_idle(void); - -+struct pt_regs; -+typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info); -+ - #ifdef CONFIG_SMP - - #include <linux/preempt.h> -@@ -49,6 +52,8 @@ extern int __cpu_up(unsigned int cpunum) - */ - extern void smp_cpus_done(unsigned int max_cpus); - -+extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait); -+ - /* - * Call a function on all other processors - */ -@@ -99,6 +104,12 @@ static inline void smp_send_reschedule(i - #define num_booting_cpus() 1 - #define smp_prepare_boot_cpu() do {} while (0) - -+static inline int smp_nmi_call_function(smp_nmi_function func, -+ void *info, int wait) -+{ -+ return 0; -+} -+ - #endif /* !SMP */ - - /* -diff -upr linux-2.6.16.orig/include/linux/socket.h linux-2.6.16-026test009/include/linux/socket.h ---- 
linux-2.6.16.orig/include/linux/socket.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/socket.h 2006-04-19 15:02:12.000000000 +0400 -@@ -300,6 +300,7 @@ extern int memcpy_toiovec(struct iovec * - extern int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen); - extern int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr); - extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); -+extern int vz_security_proto_check(int family, int type, int protocol); - - #endif - #endif /* not kernel and not glibc */ -diff -upr linux-2.6.16.orig/include/linux/swap.h linux-2.6.16-026test009/include/linux/swap.h ---- linux-2.6.16.orig/include/linux/swap.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/swap.h 2006-04-19 15:02:12.000000000 +0400 -@@ -80,6 +80,7 @@ struct address_space; - struct sysinfo; - struct writeback_control; - struct zone; -+struct user_beancounter; - - /* - * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of -@@ -119,6 +120,7 @@ enum { - /* - * The in-memory structure used to track swap areas. 
- */ -+struct user_beancounter; - struct swap_info_struct { - unsigned int flags; - int prio; /* swap priority */ -@@ -136,6 +138,9 @@ struct swap_info_struct { - unsigned int max; - unsigned int inuse_pages; - int next; /* next entry on swap list */ -+#ifdef CONFIG_USER_SWAP_ACCOUNTING -+ struct user_beancounter **swap_ubs; -+#endif - }; - - struct swap_list_t { -@@ -240,7 +245,7 @@ extern long total_swap_pages; - extern unsigned int nr_swapfiles; - extern struct swap_info_struct swap_info[]; - extern void si_swapinfo(struct sysinfo *); --extern swp_entry_t get_swap_page(void); -+extern swp_entry_t get_swap_page(struct user_beancounter *); - extern swp_entry_t get_swap_page_of_type(int type); - extern int swap_duplicate(swp_entry_t); - extern int valid_swaphandles(swp_entry_t, unsigned long *); -@@ -253,7 +258,9 @@ extern int remove_exclusive_swap_page(st - struct backing_dev_info; - - extern spinlock_t swap_lock; --extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page); -+struct page_beancounter; -+extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page, -+ struct page_beancounter **pb); - - /* linux/mm/thrash.c */ - extern struct mm_struct * swap_token_mm; -@@ -310,7 +317,7 @@ static inline int remove_exclusive_swap_ - return 0; - } - --static inline swp_entry_t get_swap_page(void) -+static inline swp_entry_t get_swap_page(struct user_beancounter *ub) - { - swp_entry_t entry; - entry.val = 0; -diff -upr linux-2.6.16.orig/include/linux/sysctl.h linux-2.6.16-026test009/include/linux/sysctl.h ---- linux-2.6.16.orig/include/linux/sysctl.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/sysctl.h 2006-04-19 15:02:12.000000000 +0400 -@@ -148,6 +148,8 @@ enum - KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ - KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ - KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ -+ KERN_VIRT_PIDS=202, /* int: VE 
pids virtualization */ -+ KERN_VIRT_OSRELEASE=205,/* virtualization of utsname.release */ - }; - - -@@ -401,6 +403,7 @@ enum - - enum { - NET_IPV4_ROUTE_FLUSH=1, -+ NET_IPV4_ROUTE_SRC_CHECK=188, - NET_IPV4_ROUTE_MIN_DELAY=2, - NET_IPV4_ROUTE_MAX_DELAY=3, - NET_IPV4_ROUTE_GC_THRESH=4, -@@ -760,6 +763,12 @@ enum - FS_AIO_NR=18, /* current system-wide number of aio requests */ - FS_AIO_MAX_NR=19, /* system-wide maximum number of aio requests */ - FS_INOTIFY=20, /* inotify submenu */ -+ FS_AT_VSYSCALL=21, /* int: to announce vsyscall data */ -+}; -+ -+/* /proc/sys/debug */ -+enum { -+ DBG_DECODE_CALLTRACES = 1, /* int: decode call traces on oops */ - }; - - /* /proc/sys/fs/quota/ */ -@@ -900,6 +909,8 @@ extern int proc_doulongvec_minmax(ctl_ta - void __user *, size_t *, loff_t *); - extern int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int, - struct file *, void __user *, size_t *, loff_t *); -+extern int proc_doutsstring(ctl_table *table, int write, struct file *, -+ void __user *, size_t *, loff_t *); - - extern int do_sysctl (int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, -@@ -954,6 +965,8 @@ extern ctl_handler sysctl_ms_jiffies; - */ - - /* A sysctl table is an array of struct ctl_table: */ -+struct ve_struct; -+ - struct ctl_table - { - int ctl_name; /* Binary ID */ -@@ -967,6 +980,7 @@ struct ctl_table - struct proc_dir_entry *de; /* /proc control block */ - void *extra1; - void *extra2; -+ struct ve_struct *owner_env; - }; - - /* struct ctl_table_header is used to maintain dynamic lists of -@@ -983,6 +997,9 @@ struct ctl_table_header * register_sysct - int insert_at_head); - void unregister_sysctl_table(struct ctl_table_header * table); - -+ctl_table *clone_sysctl_template(ctl_table *tmpl, int nr); -+void free_sysctl_clone(ctl_table *clone); -+ - #else /* __KERNEL__ */ - - #endif /* __KERNEL__ */ -diff -upr linux-2.6.16.orig/include/linux/tty.h linux-2.6.16-026test009/include/linux/tty.h ---- 
linux-2.6.16.orig/include/linux/tty.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/tty.h 2006-04-19 15:02:12.000000000 +0400 -@@ -238,8 +238,11 @@ struct tty_struct { - spinlock_t read_lock; - /* If the tty has a pending do_SAK, queue it here - akpm */ - struct work_struct SAK_work; -+ struct ve_struct *owner_env; - }; - -+DCL_VE_OWNER_PROTO(TTY, struct tty_struct, owner_env) -+ - /* tty magic number */ - #define TTY_MAGIC 0x5401 - -@@ -266,6 +269,7 @@ struct tty_struct { - #define TTY_PTY_LOCK 16 /* pty private */ - #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ - #define TTY_HUPPED 18 /* Post driver->hangup() */ -+#define TTY_CHARGED 19 /* Charged as ub resource */ - - #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) - -diff -upr linux-2.6.16.orig/include/linux/tty_driver.h linux-2.6.16-026test009/include/linux/tty_driver.h ---- linux-2.6.16.orig/include/linux/tty_driver.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/tty_driver.h 2006-04-19 15:02:12.000000000 +0400 -@@ -115,6 +115,7 @@ - * character to the device. 
- */ - -+#include <linux/ve_owner.h> - #include <linux/fs.h> - #include <linux/list.h> - #include <linux/cdev.h> -@@ -214,9 +215,18 @@ struct tty_driver { - unsigned int set, unsigned int clear); - - struct list_head tty_drivers; -+ struct ve_struct *owner_env; - }; - -+DCL_VE_OWNER_PROTO(TTYDRV, struct tty_driver, owner_env) -+ -+#ifdef CONFIG_LEGACY_PTYS -+extern struct tty_driver *pty_driver; -+extern struct tty_driver *pty_slave_driver; -+#endif -+ - extern struct list_head tty_drivers; -+extern rwlock_t tty_driver_guard; - - struct tty_driver *alloc_tty_driver(int lines); - void put_tty_driver(struct tty_driver *driver); -diff -upr linux-2.6.16.orig/include/linux/ve.h linux-2.6.16-026test009/include/linux/ve.h ---- linux-2.6.16.orig/include/linux/ve.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/ve.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,313 @@ -+/* -+ * include/linux/ve.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef _LINUX_VE_H -+#define _LINUX_VE_H -+ -+#include <linux/config.h> -+ -+#ifndef __ENVID_T_DEFINED__ -+typedef unsigned envid_t; -+#define __ENVID_T_DEFINED__ -+#endif -+ -+#include <linux/types.h> -+#include <linux/capability.h> -+#include <linux/utsname.h> -+#include <linux/sysctl.h> -+#include <linux/vzstat.h> -+#include <linux/kobject.h> -+ -+#ifdef VZMON_DEBUG -+# define VZTRACE(fmt,args...) \ -+ printk(KERN_DEBUG fmt, ##args) -+#else -+# define VZTRACE(fmt,args...) 
-+#endif /* VZMON_DEBUG */ -+ -+struct tty_driver; -+struct devpts_config; -+struct task_struct; -+struct new_utsname; -+struct file_system_type; -+struct icmp_mib; -+struct ip_mib; -+struct tcp_mib; -+struct udp_mib; -+struct linux_mib; -+struct fib_info; -+struct fib_rule; -+struct veip_struct; -+struct ve_monitor; -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+struct fib_table; -+struct devcnfv4_struct; -+#ifdef CONFIG_VE_IPTABLES -+struct xt_af; -+struct xt_table; -+struct xt_target; -+struct ip_conntrack; -+typedef unsigned int (*ip_nat_helper_func)(void); -+struct ve_ip_conntrack { -+ struct list_head *_ip_conntrack_hash; -+ struct list_head _ip_conntrack_expect_list; -+ struct list_head _ip_conntrack_unconfirmed; -+ struct ip_conntrack_protocol ** _ip_ct_protos; -+ struct list_head _ip_conntrack_helpers; -+ int _ip_conntrack_max; -+ int _ip_conntrack_vmalloc; -+ atomic_t _ip_conntrack_count; -+ void (*_ip_conntrack_destroyed)(struct ip_conntrack *conntrack); -+#ifdef CONFIG_SYSCTL -+ unsigned long _ip_ct_tcp_timeouts[10]; -+ unsigned long _ip_ct_udp_timeout; -+ unsigned long _ip_ct_udp_timeout_stream; -+ unsigned long _ip_ct_icmp_timeout; -+ unsigned long _ip_ct_generic_timeout; -+ unsigned int _ip_ct_log_invalid; -+ unsigned long _ip_ct_tcp_timeout_max_retrans; -+ int _ip_ct_tcp_loose; -+ int _ip_ct_tcp_be_liberal; -+ int _ip_ct_tcp_max_retrans; -+ struct ctl_table_header *_ip_ct_sysctl_header; -+ ctl_table *_ip_ct_net_table; -+ ctl_table *_ip_ct_ipv4_table; -+ ctl_table *_ip_ct_netfilter_table; -+ ctl_table *_ip_ct_sysctl_table; -+#endif /*CONFIG_SYSCTL*/ -+ -+ struct ip_nat_protocol **_ip_nat_protos; -+ ip_nat_helper_func _ip_nat_ftp_hook; -+ ip_nat_helper_func _ip_nat_irc_hook; -+ struct list_head *_ip_nat_bysource; -+ struct xt_table *_ip_nat_table; -+ -+ /* resource accounting */ -+ struct user_beancounter *ub; -+}; -+#endif -+#endif -+ -+#define UIDHASH_BITS_VE 6 -+#define UIDHASH_SZ_VE (1 << UIDHASH_BITS_VE) -+ -+struct 
ve_cpu_stats { -+ cycles_t idle_time; -+ cycles_t iowait_time; -+ cycles_t strt_idle_time; -+ cycles_t used_time; -+ seqcount_t stat_lock; -+ int nr_running; -+ int nr_unint; -+ int nr_iowait; -+ cputime64_t user; -+ cputime64_t nice; -+ cputime64_t system; -+} ____cacheline_aligned; -+ -+struct ve_struct { -+ struct ve_struct *prev; -+ struct ve_struct *next; -+ -+ envid_t veid; -+ struct task_struct *init_entry; -+ struct list_head vetask_lh; -+ kernel_cap_t cap_default; -+ atomic_t pcounter; -+ /* ref counter to ve from ipc */ -+ atomic_t counter; -+ unsigned int class_id; -+ struct veip_struct *veip; -+ struct rw_semaphore op_sem; -+ int is_running; -+ int is_locked; -+ int virt_pids; -+ /* see vzcalluser.h for VE_FEATURE_XXX definitions */ -+ __u64 features; -+ -+/* VE's root */ -+ struct vfsmount *fs_rootmnt; -+ struct dentry *fs_root; -+ -+/* sysctl */ -+ struct new_utsname *utsname; -+ struct list_head sysctl_lh; -+ struct ctl_table_header *kern_header; -+ struct ctl_table *kern_table; -+ struct ctl_table_header *quota_header; -+ struct ctl_table *quota_table; -+ struct file_system_type *proc_fstype; -+ struct vfsmount *proc_mnt; -+ struct proc_dir_entry *proc_root; -+ struct proc_dir_entry *proc_sys_root; -+ struct proc_dir_entry *_proc_net; -+ struct proc_dir_entry *_proc_net_stat; -+ -+/* SYSV IPC */ -+ struct ipc_ids *_shm_ids; -+ struct ipc_ids *_msg_ids; -+ struct ipc_ids *_sem_ids; -+ int _used_sems; -+ int _shm_tot; -+ size_t _shm_ctlmax; -+ size_t _shm_ctlall; -+ int _shm_ctlmni; -+ int _msg_ctlmax; -+ int _msg_ctlmni; -+ int _msg_ctlmnb; -+ int _sem_ctls[4]; -+ -+/* BSD pty's */ -+ struct tty_driver *pty_driver; -+ struct tty_driver *pty_slave_driver; -+ -+#ifdef CONFIG_UNIX98_PTYS -+ struct tty_driver *ptm_driver; -+ struct tty_driver *pts_driver; -+ struct idr *allocated_ptys; -+ struct file_system_type *devpts_fstype; -+ struct vfsmount *devpts_mnt; -+ struct dentry *devpts_root; -+ struct devpts_config *devpts_config; -+#endif -+ -+ struct 
file_system_type *shmem_fstype; -+ struct vfsmount *shmem_mnt; -+#ifdef CONFIG_SYSFS -+ struct file_system_type *sysfs_fstype; -+ struct vfsmount *sysfs_mnt; -+ struct super_block *sysfs_sb; -+ struct sysfs_dirent *sysfs_root; -+#endif -+ struct subsystem *class_subsys; -+ struct subsystem *class_obj_subsys; -+ struct class *net_class; -+ -+/* User uids hash */ -+ struct list_head uidhash_table[UIDHASH_SZ_VE]; -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+ struct hlist_head _net_dev_head; -+ struct hlist_head _net_dev_index_head; -+ struct net_device *_net_dev_base, **_net_dev_tail; -+ int ifindex; -+ struct net_device *_loopback_dev; -+ struct net_device *_venet_dev; -+ struct ipv4_devconf *_ipv4_devconf; -+ struct ipv4_devconf *_ipv4_devconf_dflt; -+ struct ctl_table_header *forward_header; -+ struct ctl_table *forward_table; -+#endif -+ unsigned long rt_flush_required; -+ -+/* per VE CPU stats*/ -+ struct timespec start_timespec; -+ u64 start_jiffies; -+ cycles_t start_cycles; -+ unsigned long avenrun[3]; /* loadavg data */ -+ -+ cycles_t cpu_used_ve; -+ struct kstat_lat_pcpu_struct sched_lat_ve; -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+ struct hlist_head *_fib_info_hash; -+ struct hlist_head *_fib_info_laddrhash; -+ int _fib_hash_size; -+ int _fib_info_cnt; -+ -+ struct fib_rule *_local_rule; -+ struct fib_rule *_fib_rules; -+#ifdef CONFIG_IP_MULTIPLE_TABLES -+ /* XXX: why a magic constant? 
*/ -+ struct fib_table *_fib_tables[256]; /* RT_TABLE_MAX - for now */ -+#else -+ struct fib_table *_main_table; -+ struct fib_table *_local_table; -+#endif -+ struct icmp_mib *_icmp_statistics[2]; -+ struct ipstats_mib *_ip_statistics[2]; -+ struct tcp_mib *_tcp_statistics[2]; -+ struct udp_mib *_udp_statistics[2]; -+ struct linux_mib *_net_statistics[2]; -+ struct venet_stat *stat; -+#ifdef CONFIG_VE_IPTABLES -+/* core/netfilter.c virtualization */ -+ void *_nf_hooks; -+ struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */ -+ struct xt_table *_ipt_mangle_table; -+ struct xt_af *_xt; -+ struct xt_target *_ipt_standard_target; -+ -+ __u64 _iptables_modules; -+ struct ve_ip_conntrack *_ip_conntrack; -+#endif /* CONFIG_VE_IPTABLES */ -+#endif -+ wait_queue_head_t *_log_wait; -+ unsigned long *_log_start; -+ unsigned long *_log_end; -+ unsigned long *_logged_chars; -+ char *log_buf; -+#define VE_DEFAULT_LOG_BUF_LEN 4096 -+ -+ struct ve_cpu_stats ve_cpu_stats[NR_CPUS] ____cacheline_aligned; -+ unsigned long down_at; -+ struct list_head cleanup_list; -+ -+ unsigned long jiffies_fixup; -+ unsigned char disable_net; -+ unsigned char sparse_vpid; -+ struct ve_monitor *monitor; -+ struct proc_dir_entry *monitor_proc; -+}; -+ -+#define VE_CPU_STATS(ve, cpu) (&((ve)->ve_cpu_stats[(cpu)])) -+ -+extern int nr_ve; -+ -+#ifdef CONFIG_VE -+ -+int get_device_perms_ve(int dev_type, dev_t dev, int access_mode); -+void do_env_cleanup(struct ve_struct *envid); -+void do_update_load_avg_ve(void); -+void do_env_free(struct ve_struct *ptr); -+ -+#define ve_utsname (*get_exec_env()->utsname) -+ -+static inline struct ve_struct *get_ve(struct ve_struct *ptr) -+{ -+ if (ptr != NULL) -+ atomic_inc(&ptr->counter); -+ return ptr; -+} -+ -+static inline void put_ve(struct ve_struct *ptr) -+{ -+ if (ptr && atomic_dec_and_test(&ptr->counter)) { -+ if (atomic_read(&ptr->pcounter) > 0) -+ BUG(); -+ if (ptr->is_running) -+ BUG(); -+ do_env_free(ptr); -+ } -+} -+ -+#ifdef CONFIG_FAIRSCHED 
-+#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask) -+#else -+#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0) -+#endif -+#else /* CONFIG_VE */ -+#define ve_utsname system_utsname -+#define get_ve(ve) (NULL) -+#define put_ve(ve) do { } while (0) -+#endif /* CONFIG_VE */ -+ -+#endif /* _LINUX_VE_H */ -diff -upr linux-2.6.16.orig/include/linux/ve_owner.h linux-2.6.16-026test009/include/linux/ve_owner.h ---- linux-2.6.16.orig/include/linux/ve_owner.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/ve_owner.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,32 @@ -+/* -+ * include/linux/ve_owner.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef __VE_OWNER_H__ -+#define __VE_OWNER_H__ -+ -+#include <linux/config.h> -+#include <linux/vmalloc.h> -+ -+ -+#define DCL_VE_OWNER(name, type, member) -+ /* prototype declares static inline functions */ -+ -+#define DCL_VE_OWNER_PROTO(name, type, member) \ -+type; \ -+static inline struct ve_struct *VE_OWNER_##name(const type *obj) \ -+{ \ -+ return obj->member; \ -+} \ -+static inline void SET_VE_OWNER_##name(type *obj, struct ve_struct *ve) \ -+{ \ -+ obj->member = ve; \ -+} -+ -+#endif /* __VE_OWNER_H__ */ -diff -upr linux-2.6.16.orig/include/linux/ve_proto.h linux-2.6.16-026test009/include/linux/ve_proto.h ---- linux-2.6.16.orig/include/linux/ve_proto.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/ve_proto.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,75 @@ -+/* -+ * include/linux/ve_proto.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __VE_H__ -+#define __VE_H__ -+ -+#ifdef CONFIG_VE -+ -+extern struct semaphore ve_call_guard; -+extern rwlock_t ve_call_lock; -+ -+#ifdef CONFIG_SYSVIPC -+extern void prepare_ipc(void); -+extern int init_ve_ipc(struct ve_struct *); -+extern void fini_ve_ipc(struct ve_struct *); -+extern void ve_ipc_cleanup(void); -+#endif -+ -+#ifdef CONFIG_UNIX98_PTYS -+extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ -+extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ -+#endif -+ -+extern rwlock_t tty_driver_guard; -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+void ip_fragment_cleanup(struct ve_struct *envid); -+void tcp_v4_kill_ve_sockets(struct ve_struct *envid); -+struct fib_table * fib_hash_init(int id); -+int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr); -+extern int main_loopback_init(struct net_device*); -+int venet_init(void); -+#endif -+ -+extern struct ve_struct *ve_list_head; -+extern rwlock_t ve_list_guard; -+extern struct ve_struct *get_ve_by_id(envid_t); -+extern struct ve_struct *__find_ve_by_id(envid_t); -+ -+struct env_create_param2; -+extern int real_env_create(envid_t veid, unsigned flags, u32 class_id, -+ struct env_create_param2 *data, int datalen); -+ -+extern int do_setdevperms(envid_t veid, unsigned type, -+ dev_t dev, unsigned mask); -+ -+#define VE_HOOK_INIT 0 -+#define VE_HOOK_FINI 1 -+#define VE_MAX_HOOKS 2 -+ -+typedef int ve_hookfn(unsigned int hooknum, void *data); -+ -+struct ve_hook -+{ -+ struct list_head list; -+ ve_hookfn *hook; -+ ve_hookfn *undo; -+ struct module *owner; -+ int hooknum; -+ /* Functions are called in ascending priority. 
*/ -+ int priority; -+}; -+ -+extern int ve_hook_register(struct ve_hook *vh); -+extern void ve_hook_unregister(struct ve_hook *vh); -+ -+#endif -+#endif -diff -upr linux-2.6.16.orig/include/linux/ve_task.h linux-2.6.16-026test009/include/linux/ve_task.h ---- linux-2.6.16.orig/include/linux/ve_task.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/ve_task.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,34 @@ -+/* -+ * include/linux/ve_task.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef __VE_TASK_H__ -+#define __VE_TASK_H__ -+ -+#include <linux/seqlock.h> -+ -+struct ve_task_info { -+/* virtualization */ -+ struct ve_struct *owner_env; -+ struct ve_struct *exec_env; -+ struct list_head vetask_list; -+ struct dentry *glob_proc_dentry; -+/* statistics: scheduling latency */ -+ cycles_t sleep_time; -+ cycles_t sched_time; -+ cycles_t sleep_stamp; -+ cycles_t wakeup_stamp; -+ seqcount_t wakeup_lock; -+}; -+ -+#define VE_TASK_INFO(task) (&(task)->ve_task_info) -+#define VE_TASK_LIST_2_TASK(lh) \ -+ list_entry(lh, struct task_struct, ve_task_info.vetask_list) -+ -+#endif /* __VE_TASK_H__ */ -diff -upr linux-2.6.16.orig/include/linux/venet.h linux-2.6.16-026test009/include/linux/venet.h ---- linux-2.6.16.orig/include/linux/venet.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/venet.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,68 @@ -+/* -+ * include/linux/venet.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef _VENET_H -+#define _VENET_H -+ -+#include <linux/list.h> -+#include <linux/spinlock.h> -+#include <linux/vzcalluser.h> -+ -+#define VEIP_HASH_SZ 512 -+ -+struct ve_struct; -+struct venet_stat; -+struct ip_entry_struct -+{ -+ __u32 ip; -+ struct ve_struct *active_env; -+ struct venet_stat *stat; -+ struct veip_struct *veip; -+ struct list_head ip_hash; -+ struct list_head ve_list; -+}; -+ -+struct veip_struct -+{ -+ struct list_head src_lh; -+ struct list_head dst_lh; -+ struct list_head ip_lh; -+ struct list_head list; -+ envid_t veid; -+}; -+ -+/* veip_hash_lock should be taken for write by caller */ -+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip); -+/* veip_hash_lock should be taken for write by caller */ -+void ip_entry_unhash(struct ip_entry_struct *entry); -+/* veip_hash_lock should be taken for read by caller */ -+struct ip_entry_struct *ip_entry_lookup(u32 addr); -+ -+/* veip_hash_lock should be taken for read by caller */ -+struct veip_struct *veip_find(envid_t veid); -+/* veip_hash_lock should be taken for write by caller */ -+struct veip_struct *veip_findcreate(envid_t veid); -+/* veip_hash_lock should be taken for write by caller */ -+void veip_put(struct veip_struct *veip); -+ -+int veip_start(struct ve_struct *ve); -+void veip_stop(struct ve_struct *ve); -+int veip_entry_add(struct ve_struct *ve, struct sockaddr_in *addr); -+int veip_entry_del(envid_t veid, struct sockaddr_in *addr); -+int venet_change_skb_owner(struct sk_buff *skb); -+ -+extern struct list_head ip_entry_hash_table[]; -+extern rwlock_t veip_hash_lock; -+ -+#ifdef CONFIG_PROC_FS -+int veip_seq_show(struct seq_file *m, void *v); -+#endif -+ -+#endif -diff -upr linux-2.6.16.orig/include/linux/veprintk.h linux-2.6.16-026test009/include/linux/veprintk.h ---- linux-2.6.16.orig/include/linux/veprintk.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/veprintk.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 
+1,38 @@ -+/* -+ * include/linux/veprintk.h -+ * -+ * Copyright (C) 2006 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef __VE_PRINTK_H__ -+#define __VE_PRINTK_H__ -+ -+#ifdef CONFIG_VE -+ -+#define ve_log_wait (*(get_exec_env()->_log_wait)) -+#define ve_log_start (*(get_exec_env()->_log_start)) -+#define ve_log_end (*(get_exec_env()->_log_end)) -+#define ve_logged_chars (*(get_exec_env()->_logged_chars)) -+#define ve_log_buf (get_exec_env()->log_buf) -+#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \ -+ log_buf_len : VE_DEFAULT_LOG_BUF_LEN) -+#define VE_LOG_BUF_MASK (ve_log_buf_len - 1) -+#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK]) -+ -+#else -+ -+#define ve_log_wait log_wait -+#define ve_log_start log_start -+#define ve_log_end log_end -+#define ve_logged_chars logged_chars -+#define ve_log_buf log_buf -+#define ve_log_buf_len log_buf_len -+#define VE_LOG_BUF_MASK LOG_BUF_MASK -+#define VE_LOG_BUF(idx) LOG_BUF(idx) -+ -+#endif /* CONFIG_VE */ -+#endif /* __VE_PRINTK_H__ */ -diff -upr linux-2.6.16.orig/include/linux/virtinfo.h linux-2.6.16-026test009/include/linux/virtinfo.h ---- linux-2.6.16.orig/include/linux/virtinfo.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/virtinfo.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,52 @@ -+/* -+ * include/linux/virtinfo.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __LINUX_VIRTINFO_H -+#define __LINUX_VIRTINFO_H -+ -+#include <linux/kernel.h> -+#include <linux/page-flags.h> -+#include <linux/rwsem.h> -+#include <linux/notifier.h> -+ -+struct vnotifier_block -+{ -+ int (*notifier_call)(struct vnotifier_block *self, -+ unsigned long, void *, int); -+ struct vnotifier_block *next; -+ int priority; -+}; -+ -+void virtinfo_notifier_register(int type, struct vnotifier_block *nb); -+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb); -+int virtinfo_notifier_call(int type, unsigned long n, void *data); -+ -+struct meminfo { -+ struct sysinfo si; -+ unsigned long active, inactive; -+ unsigned long cache, swapcache; -+ unsigned long committed_space; -+ unsigned long allowed; -+ struct page_state ps; -+ unsigned long vmalloc_total, vmalloc_used, vmalloc_largest; -+}; -+ -+#define VIRTINFO_MEMINFO 0 -+#define VIRTINFO_ENOUGHMEM 1 -+ -+enum virt_info_types { -+ VITYPE_GENERAL, -+ VITYPE_FAUDIT, -+ VITYPE_QUOTA, -+ -+ VIRT_TYPES -+}; -+ -+#endif /* __LINUX_VIRTINFO_H */ -diff -upr linux-2.6.16.orig/include/linux/vmalloc.h linux-2.6.16-026test009/include/linux/vmalloc.h ---- linux-2.6.16.orig/include/linux/vmalloc.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vmalloc.h 2006-04-19 15:02:12.000000000 +0400 -@@ -18,6 +18,10 @@ - #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ - #endif - -+/* align size to 2^n page boundary */ -+#define POWER2_PAGE_ALIGN(size) \ -+ ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size)))) -+ - struct vm_struct { - void *addr; - unsigned long size; -@@ -36,6 +40,8 @@ extern void *vmalloc_node(unsigned long - extern void *vmalloc_exec(unsigned long size); - extern void *vmalloc_32(unsigned long size); - extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -+extern void *vmalloc_best(unsigned long size); -+extern void *ub_vmalloc_best(unsigned long size); - extern void *__vmalloc_area(struct vm_struct 
*area, gfp_t gfp_mask, - pgprot_t prot); - extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, -@@ -52,6 +58,9 @@ extern void vunmap(void *addr); - extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); - extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end); -+extern struct vm_struct * get_vm_area_best(unsigned long size, -+ unsigned long flags); -+extern void vprintstat(void); - extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node); - extern struct vm_struct *remove_vm_area(void *addr); -diff -upr linux-2.6.16.orig/include/linux/vzcalluser.h linux-2.6.16-026test009/include/linux/vzcalluser.h ---- linux-2.6.16.orig/include/linux/vzcalluser.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzcalluser.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,221 @@ -+/* -+ * include/linux/vzcalluser.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef _LINUX_VZCALLUSER_H -+#define _LINUX_VZCALLUSER_H -+ -+#include <linux/types.h> -+#include <linux/ioctl.h> -+ -+#define KERN_VZ_PRIV_RANGE 51 -+ -+#ifndef __ENVID_T_DEFINED__ -+typedef unsigned envid_t; -+#define __ENVID_T_DEFINED__ -+#endif -+ -+/* -+ * VE management ioctls -+ */ -+ -+struct vzctl_old_env_create { -+ envid_t veid; -+ unsigned flags; -+#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */ -+#define VE_EXCLUSIVE 2 /* Fail if exists */ -+#define VE_ENTER 4 /* Enter existing VE */ -+#define VE_TEST 8 /* Test if VE exists */ -+#define VE_LOCK 16 /* Do not allow entering created VE */ -+#define VE_SKIPLOCK 32 /* Allow entering embrion VE */ -+ __u32 addr; -+}; -+ -+struct vzctl_mark_env_to_down { -+ envid_t veid; -+}; -+ -+struct vzctl_setdevperms { -+ envid_t veid; -+ unsigned type; -+#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */ -+#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */ -+#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */ -+ unsigned dev; -+ unsigned mask; -+}; -+ -+struct vzctl_ve_netdev { -+ envid_t veid; -+ int op; -+#define VE_NETDEV_ADD 1 -+#define VE_NETDEV_DEL 2 -+ char *dev_name; -+}; -+ -+/* these masks represent modules */ -+#define VE_IP_IPTABLES_MOD (1U<<0) -+#define VE_IP_FILTER_MOD (1U<<1) -+#define VE_IP_MANGLE_MOD (1U<<2) -+#define VE_IP_MATCH_LIMIT_MOD (1U<<3) -+#define VE_IP_MATCH_MULTIPORT_MOD (1U<<4) -+#define VE_IP_MATCH_TOS_MOD (1U<<5) -+#define VE_IP_TARGET_TOS_MOD (1U<<6) -+#define VE_IP_TARGET_REJECT_MOD (1U<<7) -+#define VE_IP_TARGET_TCPMSS_MOD (1U<<8) -+#define VE_IP_MATCH_TCPMSS_MOD (1U<<9) -+#define VE_IP_MATCH_TTL_MOD (1U<<10) -+#define VE_IP_TARGET_LOG_MOD (1U<<11) -+#define VE_IP_MATCH_LENGTH_MOD (1U<<12) -+#define VE_IP_CONNTRACK_MOD (1U<<14) -+#define VE_IP_CONNTRACK_FTP_MOD (1U<<15) -+#define VE_IP_CONNTRACK_IRC_MOD (1U<<16) -+#define VE_IP_MATCH_CONNTRACK_MOD (1U<<17) -+#define VE_IP_MATCH_STATE_MOD (1U<<18) -+#define 
VE_IP_MATCH_HELPER_MOD (1U<<19) -+#define VE_IP_NAT_MOD (1U<<20) -+#define VE_IP_NAT_FTP_MOD (1U<<21) -+#define VE_IP_NAT_IRC_MOD (1U<<22) -+#define VE_IP_TARGET_REDIRECT_MOD (1U<<23) -+ -+/* these masks represent modules with their dependences */ -+#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD) -+#define VE_IP_FILTER (VE_IP_FILTER_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_MANGLE (VE_IP_MANGLE_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_MATCH_LIMIT (VE_IP_MATCH_LIMIT_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_MATCH_MULTIPORT (VE_IP_MATCH_MULTIPORT_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_MATCH_TOS (VE_IP_MATCH_TOS_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_TARGET_TOS (VE_IP_TARGET_TOS_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_TARGET_REJECT (VE_IP_TARGET_REJECT_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_TARGET_TCPMSS (VE_IP_TARGET_TCPMSS_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_MATCH_TCPMSS (VE_IP_MATCH_TCPMSS_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_MATCH_TTL (VE_IP_MATCH_TTL_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_TARGET_LOG (VE_IP_TARGET_LOG_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_MATCH_LENGTH (VE_IP_MATCH_LENGTH_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD \ -+ | VE_IP_IPTABLES) -+#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD \ -+ | VE_IP_CONNTRACK) -+#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD \ -+ | VE_IP_CONNTRACK) -+#define VE_IP_MATCH_CONNTRACK (VE_IP_MATCH_CONNTRACK_MOD \ -+ | VE_IP_CONNTRACK) -+#define VE_IP_MATCH_STATE (VE_IP_MATCH_STATE_MOD \ -+ | VE_IP_CONNTRACK) -+#define VE_IP_MATCH_HELPER (VE_IP_MATCH_HELPER_MOD \ -+ | VE_IP_CONNTRACK) -+#define VE_IP_NAT (VE_IP_NAT_MOD \ -+ | VE_IP_CONNTRACK) -+#define VE_IP_NAT_FTP (VE_IP_NAT_FTP_MOD \ -+ | VE_IP_NAT | VE_IP_CONNTRACK_FTP) -+#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD \ -+ | VE_IP_NAT | VE_IP_CONNTRACK_IRC) -+#define VE_IP_TARGET_REDIRECT (VE_IP_TARGET_REDIRECT_MOD \ -+ | VE_IP_NAT) -+ -+/* safe iptables mask to be used by default */ -+#define 
VE_IP_DEFAULT \ -+ (VE_IP_IPTABLES | \ -+ VE_IP_FILTER | VE_IP_MANGLE | \ -+ VE_IP_MATCH_LIMIT | VE_IP_MATCH_MULTIPORT | \ -+ VE_IP_MATCH_TOS | VE_IP_TARGET_REJECT | \ -+ VE_IP_TARGET_TCPMSS | VE_IP_MATCH_TCPMSS | \ -+ VE_IP_MATCH_TTL | VE_IP_MATCH_LENGTH) -+ -+#define VE_IPT_CMP(x,y) (((x) & (y)) == (y)) -+ -+struct vzctl_env_create_cid { -+ envid_t veid; -+ unsigned flags; -+ __u32 class_id; -+}; -+ -+struct vzctl_env_create { -+ envid_t veid; -+ unsigned flags; -+ __u32 class_id; -+}; -+ -+struct env_create_param { -+ __u64 iptables_mask; -+}; -+ -+#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param) -+ -+struct env_create_param2 { -+ __u64 iptables_mask; -+ __u64 feature_mask; -+#define VE_FEATURE_SYSFS (1ULL << 0) -+ __u32 total_vcpus; /* 0 - don't care, same as in host */ -+}; -+#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(struct env_create_param2) -+ -+typedef struct env_create_param2 env_create_param_t; -+ -+struct vzctl_env_create_data { -+ envid_t veid; -+ unsigned flags; -+ __u32 class_id; -+ env_create_param_t *data; -+ int datalen; -+}; -+ -+struct vz_load_avg { -+ int val_int; -+ int val_frac; -+}; -+ -+struct vz_cpu_stat { -+ unsigned long user_jif; -+ unsigned long nice_jif; -+ unsigned long system_jif; -+ unsigned long uptime_jif; -+ __u64 idle_clk; -+ __u64 strv_clk; -+ __u64 uptime_clk; -+ struct vz_load_avg avenrun[3]; /* loadavg data */ -+}; -+ -+struct vzctl_cpustatctl { -+ envid_t veid; -+ struct vz_cpu_stat *cpustat; -+}; -+ -+#define VZCTLTYPE '.' 
-+#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \ -+ struct vzctl_old_env_create) -+#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \ -+ struct vzctl_mark_env_to_down) -+#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \ -+ struct vzctl_setdevperms) -+#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \ -+ struct vzctl_env_create_cid) -+#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \ -+ struct vzctl_env_create) -+#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \ -+ struct vzctl_cpustatctl) -+#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ -+ struct vzctl_env_create_data) -+#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \ -+ struct vzctl_ve_netdev) -+ -+ -+#endif -diff -upr linux-2.6.16.orig/include/linux/vzctl.h linux-2.6.16-026test009/include/linux/vzctl.h ---- linux-2.6.16.orig/include/linux/vzctl.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzctl.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,30 @@ -+/* -+ * include/linux/vzctl.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef _LINUX_VZCTL_H -+#define _LINUX_VZCTL_H -+ -+#include <linux/list.h> -+ -+struct module; -+struct inode; -+struct file; -+struct vzioctlinfo { -+ unsigned type; -+ int (*func)(struct inode *, struct file *, -+ unsigned int, unsigned long); -+ struct module *owner; -+ struct list_head list; -+}; -+ -+extern void vzioctl_register(struct vzioctlinfo *inf); -+extern void vzioctl_unregister(struct vzioctlinfo *inf); -+ -+#endif -diff -upr linux-2.6.16.orig/include/linux/vzctl_quota.h linux-2.6.16-026test009/include/linux/vzctl_quota.h ---- linux-2.6.16.orig/include/linux/vzctl_quota.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzctl_quota.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,43 @@ -+/* -+ * include/linux/vzctl_quota.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. 
-+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef __LINUX_VZCTL_QUOTA_H__ -+#define __LINUX_VZCTL_QUOTA_H__ -+ -+/* -+ * Quota management ioctl -+ */ -+ -+struct vz_quota_stat; -+struct vzctl_quotactl { -+ int cmd; -+ unsigned int quota_id; -+ struct vz_quota_stat *qstat; -+ char *ve_root; -+}; -+ -+struct vzctl_quotaugidctl { -+ int cmd; /* subcommand */ -+ unsigned int quota_id; /* quota id where it applies to */ -+ unsigned int ugid_index;/* for reading statistic. index of first -+ uid/gid record to read */ -+ unsigned int ugid_size; /* size of ugid_buf array */ -+ void *addr; /* user-level buffer */ -+}; -+ -+#define VZDQCTLTYPE '+' -+#define VZCTL_QUOTA_CTL _IOWR(VZDQCTLTYPE, 1, \ -+ struct vzctl_quotactl) -+#define VZCTL_QUOTA_NEW_CTL _IOWR(VZDQCTLTYPE, 2, \ -+ struct vzctl_quotactl) -+#define VZCTL_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ -+ struct vzctl_quotaugidctl) -+ -+#endif /* __LINUX_VZCTL_QUOTA_H__ */ -diff -upr linux-2.6.16.orig/include/linux/vzctl_venet.h linux-2.6.16-026test009/include/linux/vzctl_venet.h ---- linux-2.6.16.orig/include/linux/vzctl_venet.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzctl_venet.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,36 @@ -+/* -+ * include/linux/vzctl_venet.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef _VZCTL_VENET_H -+#define _VZCTL_VENET_H -+ -+#include <linux/types.h> -+#include <linux/ioctl.h> -+ -+#ifndef __ENVID_T_DEFINED__ -+typedef unsigned envid_t; -+#define __ENVID_T_DEFINED__ -+#endif -+ -+struct vzctl_ve_ip_map { -+ envid_t veid; -+ int op; -+#define VE_IP_ADD 1 -+#define VE_IP_DEL 2 -+ struct sockaddr *addr; -+ int addrlen; -+}; -+ -+#define VENETCTLTYPE '(' -+ -+#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ -+ struct vzctl_ve_ip_map) -+ -+#endif -diff -upr linux-2.6.16.orig/include/linux/vzdq_tree.h linux-2.6.16-026test009/include/linux/vzdq_tree.h ---- linux-2.6.16.orig/include/linux/vzdq_tree.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzdq_tree.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,99 @@ -+/* -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * This file contains Virtuozzo disk quota tree definition -+ */ -+ -+#ifndef _VZDQ_TREE_H -+#define _VZDQ_TREE_H -+ -+#include <linux/list.h> -+#include <asm/string.h> -+ -+typedef unsigned int quotaid_t; -+#define QUOTAID_BITS 32 -+#define QUOTAID_BBITS 4 -+#define QUOTAID_EBITS 8 -+ -+#if QUOTAID_EBITS % QUOTAID_BBITS -+#error Quota bit assumption failure -+#endif -+ -+#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS) -+#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1) -+#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \ -+ / QUOTAID_BBITS) -+#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \ -+ / QUOTAID_EBITS) -+#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS) -+ -+/* -+ * Depth of keeping unused node (not inclusive). -+ * 0 means release all nodes including root, -+ * QUOTATREE_DEPTH means never release nodes. -+ * Current value: release all nodes strictly after QUOTATREE_EDEPTH -+ * (measured in external shift units). 
-+ */ -+#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \ -+ - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \ -+ + 1) -+ -+/* -+ * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes. -+ * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS), -+ * and each node contains 2^QUOTAID_BBITS pointers. -+ * Level 0 is a (single) tree root node. -+ * -+ * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data. -+ * Nodes of lower levels contain pointers to nodes. -+ * -+ * Double pointer in array of i-level node, pointing to a (i+1)-level node -+ * (such as inside quotatree_find_state) are marked by level (i+1), not i. -+ * Level 0 double pointer is a pointer to root inside tree struct. -+ * -+ * The tree is permanent, i.e. all index blocks allocated are keeped alive to -+ * preserve the blocks numbers in the quota file tree to keep its changes -+ * locally. -+ */ -+struct quotatree_node { -+ struct list_head list; -+ quotaid_t num; -+ void *blocks[QUOTATREE_BSIZE]; -+}; -+ -+struct quotatree_level { -+ struct list_head usedlh, freelh; -+ quotaid_t freenum; -+}; -+ -+struct quotatree_tree { -+ struct quotatree_level levels[QUOTATREE_DEPTH]; -+ struct quotatree_node *root; -+ unsigned int leaf_num; -+}; -+ -+struct quotatree_find_state { -+ void **block; -+ int level; -+}; -+ -+/* number of leafs (objects) and leaf level of the tree */ -+#define QTREE_LEAFNUM(tree) ((tree)->leaf_num) -+#define QTREE_LEAFLVL(tree) (&(tree)->levels[QUOTATREE_DEPTH - 1]) -+ -+struct quotatree_tree *quotatree_alloc(void); -+void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, -+ struct quotatree_find_state *st); -+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, -+ struct quotatree_find_state *st, void *data); -+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id); -+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)); -+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id); -+void *quotatree_leaf_byindex(struct 
quotatree_tree *tree, unsigned int index); -+ -+#endif /* _VZDQ_TREE_H */ -+ -diff -upr linux-2.6.16.orig/include/linux/vzquota.h linux-2.6.16-026test009/include/linux/vzquota.h ---- linux-2.6.16.orig/include/linux/vzquota.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzquota.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,291 @@ -+/* -+ * -+ * Copyright (C) 2001-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * This file contains Virtuozzo disk quota implementation -+ */ -+ -+#ifndef _VZDQUOTA_H -+#define _VZDQUOTA_H -+ -+#include <linux/types.h> -+#include <linux/quota.h> -+ -+/* vzquotactl syscall commands */ -+#define VZ_DQ_CREATE 5 /* create quota master block */ -+#define VZ_DQ_DESTROY 6 /* destroy qmblk */ -+#define VZ_DQ_ON 7 /* mark dentry with already created qmblk */ -+#define VZ_DQ_OFF 8 /* remove mark, don't destroy qmblk */ -+#define VZ_DQ_SETLIMIT 9 /* set new limits */ -+#define VZ_DQ_GETSTAT 10 /* get usage statistic */ -+/* set of syscalls to maintain UGID quotas */ -+#define VZ_DQ_UGID_GETSTAT 1 /* get usage/limits for ugid(s) */ -+#define VZ_DQ_UGID_ADDSTAT 2 /* set usage/limits statistic for ugid(s) */ -+#define VZ_DQ_UGID_GETGRACE 3 /* get expire times */ -+#define VZ_DQ_UGID_SETGRACE 4 /* set expire times */ -+#define VZ_DQ_UGID_GETCONFIG 5 /* get ugid_max limit, cnt, flags of qmblk */ -+#define VZ_DQ_UGID_SETCONFIG 6 /* set ugid_max limit, flags of qmblk */ -+#define VZ_DQ_UGID_SETLIMIT 7 /* set ugid B/I limits */ -+#define VZ_DQ_UGID_SETINFO 8 /* set ugid info */ -+ -+/* common structure for vz and ugid quota */ -+struct dq_stat { -+ /* blocks limits */ -+ __u64 bhardlimit; /* absolute limit in bytes */ -+ __u64 bsoftlimit; /* preferred limit in bytes */ -+ time_t btime; /* time limit for excessive disk use */ -+ __u64 bcurrent; /* current bytes count */ -+ /* inodes limits */ -+ __u32 ihardlimit; /* absolute limit on allocated inodes */ -+ __u32 
isoftlimit; /* preferred inode limit */ -+ time_t itime; /* time limit for excessive inode use */ -+ __u32 icurrent; /* current # allocated inodes */ -+}; -+ -+/* One second resolution for grace times */ -+#define CURRENT_TIME_SECONDS (get_seconds()) -+ -+/* Values for dq_info->flags */ -+#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ -+#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ -+ -+struct dq_info { -+ time_t bexpire; /* expire timeout for excessive disk use */ -+ time_t iexpire; /* expire timeout for excessive inode use */ -+ unsigned flags; /* see previos defines */ -+}; -+ -+struct vz_quota_stat { -+ struct dq_stat dq_stat; -+ struct dq_info dq_info; -+}; -+ -+/* UID/GID interface record - for user-kernel level exchange */ -+struct vz_quota_iface { -+ unsigned int qi_id; /* UID/GID this applies to */ -+ unsigned int qi_type; /* USRQUOTA|GRPQUOTA */ -+ struct dq_stat qi_stat; /* limits, options, usage stats */ -+}; -+ -+/* values for flags and dq_flags */ -+/* this flag is set if the userspace has been unable to provide usage -+ * information about all ugids -+ * if the flag is set, we don't allocate new UG quota blocks (their -+ * current usage is unknown) or free existing UG quota blocks (not to -+ * lose information that this block is ok) */ -+#define VZDQUG_FIXED_SET 0x01 -+/* permit to use ugid quota */ -+#define VZDQUG_ON 0x02 -+#define VZDQ_USRQUOTA 0x10 -+#define VZDQ_GRPQUOTA 0x20 -+#define VZDQ_NOACT 0x1000 /* not actual */ -+#define VZDQ_NOQUOT 0x2000 /* not under quota tree */ -+ -+struct vz_quota_ugid_stat { -+ unsigned int limit; /* max amount of ugid records */ -+ unsigned int count; /* amount of ugid records */ -+ unsigned int flags; -+}; -+ -+struct vz_quota_ugid_setlimit { -+ unsigned int type; /* quota type (USR/GRP) */ -+ unsigned int id; /* ugid */ -+ struct if_dqblk dqb; /* limits info */ -+}; -+ -+struct vz_quota_ugid_setinfo { -+ unsigned int type; /* quota type (USR/GRP) */ -+ struct if_dqinfo dqi; /* 
grace info */ -+}; -+ -+#ifdef __KERNEL__ -+#include <linux/list.h> -+#include <asm/atomic.h> -+#include <asm/semaphore.h> -+#include <linux/time.h> -+#include <linux/vzquota_qlnk.h> -+#include <linux/vzdq_tree.h> -+ -+/* Values for dq_info flags */ -+#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ -+#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ -+ -+/* values for dq_state */ -+#define VZDQ_STARTING 0 /* created, not turned on yet */ -+#define VZDQ_WORKING 1 /* quota created, turned on */ -+#define VZDQ_STOPING 2 /* created, turned on and off */ -+ -+/* master quota record - one per veid */ -+struct vz_quota_master { -+ struct list_head dq_hash; /* next quota in hash list */ -+ atomic_t dq_count; /* inode reference count */ -+ unsigned int dq_flags; /* see VZDQUG_FIXED_SET */ -+ unsigned int dq_state; /* see values above */ -+ unsigned int dq_id; /* VEID this applies to */ -+ struct dq_stat dq_stat; /* limits, grace, usage stats */ -+ struct dq_info dq_info; /* grace times and flags */ -+ spinlock_t dq_data_lock; /* for dq_stat */ -+ -+ struct semaphore dq_sem; /* semaphore to protect -+ ugid tree */ -+ -+ struct list_head dq_ilink_list; /* list of vz_quota_ilink */ -+ struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */ -+ struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */ -+ unsigned int dq_ugid_count; /* amount of ugid records */ -+ unsigned int dq_ugid_max; /* max amount of ugid records */ -+ struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */ -+ -+ struct dentry *dq_root_dentry;/* dentry of fs tree */ -+ struct vfsmount *dq_root_mnt; /* vfsmnt of this dentry */ -+ struct super_block *dq_sb; /* superblock of our quota root */ -+}; -+ -+/* UID/GID quota record - one per pair (quota_master, uid or gid) */ -+struct vz_quota_ugid { -+ unsigned int qugid_id; /* UID/GID this applies to */ -+ struct dq_stat qugid_stat; /* limits, options, usage stats */ -+ int qugid_type; /* 
USRQUOTA|GRPQUOTA */ -+ atomic_t qugid_count; /* reference count */ -+}; -+ -+#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11) -+ -+struct vz_quota_datast { -+ struct vz_quota_ilink qlnk; -+}; -+ -+#define VIRTINFO_QUOTA_GETSTAT 0 -+#define VIRTINFO_QUOTA_ON 1 -+#define VIRTINFO_QUOTA_OFF 2 -+ -+struct virt_info_quota { -+ struct super_block *super; -+ struct dq_stat *qstat; -+}; -+ -+/* -+ * Interface to VZ quota core -+ */ -+#define INODE_QLNK(inode) (&(inode)->i_qlnk) -+#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk) -+ -+#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef) -+ -+#define VZ_QUOTAO_SETE 1 -+#define VZ_QUOTAO_INIT 2 -+#define VZ_QUOTAO_DESTR 3 -+#define VZ_QUOTAO_SWAP 4 -+#define VZ_QUOTAO_INICAL 5 -+#define VZ_QUOTAO_DRCAL 6 -+#define VZ_QUOTAO_QSET 7 -+#define VZ_QUOTAO_TRANS 8 -+#define VZ_QUOTAO_ACT 9 -+#define VZ_QUOTAO_DTREE 10 -+#define VZ_QUOTAO_DET 11 -+#define VZ_QUOTAO_ON 12 -+ -+extern struct semaphore vz_quota_sem; -+void inode_qmblk_lock(struct super_block *sb); -+void inode_qmblk_unlock(struct super_block *sb); -+void qmblk_data_read_lock(struct vz_quota_master *qmblk); -+void qmblk_data_read_unlock(struct vz_quota_master *qmblk); -+void qmblk_data_write_lock(struct vz_quota_master *qmblk); -+void qmblk_data_write_unlock(struct vz_quota_master *qmblk); -+ -+/* for quota operations */ -+void vzquota_inode_init_call(struct inode *inode); -+void vzquota_inode_drop_call(struct inode *inode); -+int vzquota_inode_transfer_call(struct inode *, struct iattr *); -+struct vz_quota_master *vzquota_inode_data(struct inode *inode, -+ struct vz_quota_datast *); -+void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *); -+int vzquota_rename_check(struct inode *inode, -+ struct inode *old_dir, struct inode *new_dir); -+struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode); -+/* for second-level quota */ -+struct vz_quota_master *vzquota_find_qmblk(struct super_block *); -+/* for 
management operations */ -+struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, -+ struct vz_quota_stat *qstat); -+void vzquota_free_master(struct vz_quota_master *); -+struct vz_quota_master *vzquota_find_master(unsigned int quota_id); -+int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, -+ struct vz_quota_master *qmblk); -+int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk); -+int vzquota_get_super(struct super_block *sb); -+void vzquota_put_super(struct super_block *sb); -+ -+static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk) -+{ -+ if (!atomic_read(&qmblk->dq_count)) -+ BUG(); -+ atomic_inc(&qmblk->dq_count); -+ return qmblk; -+} -+ -+static inline void __qmblk_put(struct vz_quota_master *qmblk) -+{ -+ atomic_dec(&qmblk->dq_count); -+} -+ -+static inline void qmblk_put(struct vz_quota_master *qmblk) -+{ -+ if (!atomic_dec_and_test(&qmblk->dq_count)) -+ return; -+ vzquota_free_master(qmblk); -+} -+ -+extern struct list_head vzquota_hash_table[]; -+extern int vzquota_hash_size; -+ -+/* -+ * Interface to VZ UGID quota -+ */ -+extern struct quotactl_ops vz_quotactl_operations; -+extern struct dquot_operations vz_quota_operations2; -+extern struct quota_format_type vz_quota_empty_v2_format; -+ -+#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? 
\ -+ qmblk->dq_uid_tree : \ -+ qmblk->dq_gid_tree) -+ -+#define VZDQUG_FIND_DONT_ALLOC 1 -+#define VZDQUG_FIND_FAKE 2 -+struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, -+ unsigned int quota_id, int type, int flags); -+struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, -+ unsigned int quota_id, int type, int flags); -+struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid); -+void vzquota_put_ugid(struct vz_quota_master *qmblk, -+ struct vz_quota_ugid *qugid); -+void vzquota_kill_ugid(struct vz_quota_master *qmblk); -+int vzquota_ugid_init(void); -+void vzquota_ugid_release(void); -+int vzquota_transfer_usage(struct inode *inode, int mask, -+ struct vz_quota_ilink *qlnk); -+ -+struct vzctl_quotaugidctl; -+long do_vzquotaugidctl(struct vzctl_quotaugidctl *qub); -+ -+/* -+ * Other VZ quota parts -+ */ -+extern struct dquot_operations vz_quota_operations; -+ -+long do_vzquotactl(int cmd, unsigned int quota_id, -+ struct vz_quota_stat *qstat, const char *ve_root); -+int vzquota_proc_init(void); -+void vzquota_proc_release(void); -+struct vz_quota_master *vzquota_find_qmblk(struct super_block *); -+extern struct semaphore vz_quota_sem; -+ -+void vzaquota_init(void); -+void vzaquota_fini(void); -+ -+#endif /* __KERNEL__ */ -+ -+#endif /* _VZDQUOTA_H */ -diff -upr linux-2.6.16.orig/include/linux/vzquota_qlnk.h linux-2.6.16-026test009/include/linux/vzquota_qlnk.h ---- linux-2.6.16.orig/include/linux/vzquota_qlnk.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzquota_qlnk.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,25 @@ -+/* -+ * include/linux/vzquota_qlnk.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef _VZDQUOTA_QLNK_H -+#define _VZDQUOTA_QLNK_H -+ -+struct vz_quota_master; -+struct vz_quota_ugid; -+ -+/* inode link, used to track inodes using quota via dq_ilink_list */ -+struct vz_quota_ilink { -+ struct vz_quota_master *qmblk; -+ struct vz_quota_ugid *qugid[MAXQUOTAS]; -+ struct list_head list; -+ unsigned char origin; -+}; -+ -+#endif /* _VZDQUOTA_QLNK_H */ -diff -upr linux-2.6.16.orig/include/linux/vzratelimit.h linux-2.6.16-026test009/include/linux/vzratelimit.h ---- linux-2.6.16.orig/include/linux/vzratelimit.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzratelimit.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,28 @@ -+/* -+ * include/linux/vzratelimit.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef __VZ_RATELIMIT_H__ -+#define __VZ_RATELIMIT_H__ -+ -+/* -+ * Generic ratelimiting stuff. -+ */ -+ -+struct vz_rate_info { -+ int burst; -+ int interval; /* jiffy_t per event */ -+ int bucket; /* kind of leaky bucket */ -+ unsigned long last; /* last event */ -+}; -+ -+/* Return true if rate limit permits. */ -+int vz_ratelimit(struct vz_rate_info *p); -+ -+#endif /* __VZ_RATELIMIT_H__ */ -diff -upr linux-2.6.16.orig/include/linux/vzstat.h linux-2.6.16-026test009/include/linux/vzstat.h ---- linux-2.6.16.orig/include/linux/vzstat.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/linux/vzstat.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,182 @@ -+/* -+ * include/linux/vzstat.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __VZSTAT_H__ -+#define __VZSTAT_H__ -+ -+struct swap_cache_info_struct { -+ unsigned long add_total; -+ unsigned long del_total; -+ unsigned long find_success; -+ unsigned long find_total; -+ unsigned long noent_race; -+ unsigned long exist_race; -+ unsigned long remove_race; -+}; -+ -+struct kstat_lat_snap_struct { -+ cycles_t maxlat, totlat; -+ unsigned long count; -+}; -+struct kstat_lat_pcpu_snap_struct { -+ cycles_t maxlat, totlat; -+ unsigned long count; -+ seqcount_t lock; -+} ____cacheline_aligned_in_smp; -+ -+struct kstat_lat_struct { -+ struct kstat_lat_snap_struct cur, last; -+ cycles_t avg[3]; -+}; -+struct kstat_lat_pcpu_struct { -+ struct kstat_lat_pcpu_snap_struct cur[NR_CPUS]; -+ cycles_t max_snap; -+ struct kstat_lat_snap_struct last; -+ cycles_t avg[3]; -+}; -+ -+struct kstat_perf_snap_struct { -+ cycles_t wall_tottime, cpu_tottime; -+ cycles_t wall_maxdur, cpu_maxdur; -+ unsigned long count; -+}; -+struct kstat_perf_struct { -+ struct kstat_perf_snap_struct cur, last; -+}; -+ -+struct kstat_zone_avg { -+ unsigned long free_pages_avg[3], -+ nr_active_avg[3], -+ nr_inactive_avg[3]; -+}; -+ -+#define KSTAT_ALLOCSTAT_NR 5 -+ -+struct kernel_stat_glob { -+ unsigned long nr_unint_avg[3]; -+ -+ unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR]; -+ struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR]; -+ struct kstat_lat_pcpu_struct sched_lat; -+ struct kstat_lat_struct swap_in; -+ -+ struct kstat_perf_struct ttfp, cache_reap, -+ refill_inact, shrink_icache, shrink_dcache; -+ -+ struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */ -+} ____cacheline_aligned; -+ -+extern struct kernel_stat_glob kstat_glob ____cacheline_aligned; -+extern spinlock_t kstat_glb_lock; -+ -+#ifdef CONFIG_VE -+#define KSTAT_PERF_ENTER(name) \ -+ unsigned long flags; \ -+ cycles_t start, sleep_time; \ -+ \ -+ start = get_cycles(); \ -+ sleep_time = VE_TASK_INFO(current)->sleep_time; \ -+ -+#define KSTAT_PERF_LEAVE(name) \ -+ spin_lock_irqsave(&kstat_glb_lock, 
flags); \ -+ kstat_glob.name.cur.count++; \ -+ start = get_cycles() - start; \ -+ if (kstat_glob.name.cur.wall_maxdur < start) \ -+ kstat_glob.name.cur.wall_maxdur = start;\ -+ kstat_glob.name.cur.wall_tottime += start; \ -+ start -= VE_TASK_INFO(current)->sleep_time - \ -+ sleep_time; \ -+ if (kstat_glob.name.cur.cpu_maxdur < start) \ -+ kstat_glob.name.cur.cpu_maxdur = start; \ -+ kstat_glob.name.cur.cpu_tottime += start; \ -+ spin_unlock_irqrestore(&kstat_glb_lock, flags); \ -+ -+#else -+#define KSTAT_PERF_ENTER(name) -+#define KSTAT_PERF_LEAVE(name) -+#endif -+ -+/* -+ * Add another statistics reading. -+ * Serialization is the caller's due. -+ */ -+static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p, -+ cycles_t dur) -+{ -+ p->cur.count++; -+ if (p->cur.maxlat < dur) -+ p->cur.maxlat = dur; -+ p->cur.totlat += dur; -+} -+ -+static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, -+ cycles_t dur) -+{ -+ struct kstat_lat_pcpu_snap_struct *cur; -+ -+ cur = &p->cur[cpu]; -+ write_seqcount_begin(&cur->lock); -+ cur->count++; -+ if (cur->maxlat < dur) -+ cur->maxlat = dur; -+ cur->totlat += dur; -+ write_seqcount_end(&cur->lock); -+} -+ -+/* -+ * Move current statistics to last, clear last. -+ * Serialization is the caller's due. 
-+ */ -+static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p) -+{ -+ cycles_t m; -+ memcpy(&p->last, &p->cur, sizeof(p->last)); -+ p->cur.maxlat = 0; -+ m = p->last.maxlat; -+ CALC_LOAD(p->avg[0], EXP_1, m) -+ CALC_LOAD(p->avg[1], EXP_5, m) -+ CALC_LOAD(p->avg[2], EXP_15, m) -+} -+ -+static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p) -+{ -+ unsigned i, cpu; -+ struct kstat_lat_pcpu_snap_struct snap, *cur; -+ cycles_t m; -+ -+ memset(&p->last, 0, sizeof(p->last)); -+ for (cpu = 0; cpu < NR_CPUS; cpu++) { -+ cur = &p->cur[cpu]; -+ do { -+ i = read_seqcount_begin(&cur->lock); -+ memcpy(&snap, cur, sizeof(snap)); -+ } while (read_seqcount_retry(&cur->lock, i)); -+ /* -+ * read above and this update of maxlat is not atomic, -+ * but this is OK, since it happens rarely and losing -+ * a couple of peaks is not essential. xemul -+ */ -+ cur->maxlat = 0; -+ -+ p->last.count += snap.count; -+ p->last.totlat += snap.totlat; -+ if (p->last.maxlat < snap.maxlat) -+ p->last.maxlat = snap.maxlat; -+ } -+ -+ m = (p->last.maxlat > p->max_snap ? 
p->last.maxlat : p->max_snap); -+ CALC_LOAD(p->avg[0], EXP_1, m); -+ CALC_LOAD(p->avg[1], EXP_5, m); -+ CALC_LOAD(p->avg[2], EXP_15, m); -+ /* reset max_snap to calculate it correctly next time */ -+ p->max_snap = 0; -+} -+ -+#endif /* __VZSTAT_H__ */ -diff -upr linux-2.6.16.orig/include/net/af_unix.h linux-2.6.16-026test009/include/net/af_unix.h ---- linux-2.6.16.orig/include/net/af_unix.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/af_unix.h 2006-04-19 15:02:12.000000000 +0400 -@@ -19,23 +19,37 @@ extern atomic_t unix_tot_inflight; - - static inline struct sock *first_unix_socket(int *i) - { -+ struct sock *s; -+ struct ve_struct *ve; -+ -+ ve = get_exec_env(); - for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { -- if (!hlist_empty(&unix_socket_table[*i])) -- return __sk_head(&unix_socket_table[*i]); -+ for (s = sk_head(&unix_socket_table[*i]); -+ s != NULL && !ve_accessible(s->sk_owner_env, ve); -+ s = sk_next(s)); -+ if (s != NULL) -+ return s; - } - return NULL; - } - - static inline struct sock *next_unix_socket(int *i, struct sock *s) - { -- struct sock *next = sk_next(s); -- /* More in this chain? */ -- if (next) -- return next; -+ struct ve_struct *ve; -+ -+ ve = get_exec_env(); -+ for (s = sk_next(s); s != NULL; s = sk_next(s)) { -+ if (!ve_accessible(s->sk_owner_env, ve)) -+ continue; -+ return s; -+ } - /* Look for next non-empty chain. 
*/ - for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { -- if (!hlist_empty(&unix_socket_table[*i])) -- return __sk_head(&unix_socket_table[*i]); -+ for (s = sk_head(&unix_socket_table[*i]); -+ s != NULL && !ve_accessible(s->sk_owner_env, ve); -+ s = sk_next(s)); -+ if (s != NULL) -+ return s; - } - return NULL; - } -diff -upr linux-2.6.16.orig/include/net/compat.h linux-2.6.16-026test009/include/net/compat.h ---- linux-2.6.16.orig/include/net/compat.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/compat.h 2006-04-19 15:02:11.000000000 +0400 -@@ -23,6 +23,14 @@ struct compat_cmsghdr { - compat_int_t cmsg_type; - }; - -+#if defined(CONFIG_X86_64) -+#define is_current_32bits() (current_thread_info()->flags & _TIF_IA32) -+#elif defined(CONFIG_IA64) -+#define is_current_32bits() (IS_IA32_PROCESS(ia64_task_regs(current))) -+#else -+#define is_current_32bits() 0 -+#endif -+ - #else /* defined(CONFIG_COMPAT) */ - #define compat_msghdr msghdr /* to avoid compiler warnings */ - #endif /* defined(CONFIG_COMPAT) */ -diff -upr linux-2.6.16.orig/include/net/flow.h linux-2.6.16-026test009/include/net/flow.h ---- linux-2.6.16.orig/include/net/flow.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/flow.h 2006-04-19 15:02:12.000000000 +0400 -@@ -10,6 +10,7 @@ - #include <linux/in6.h> - #include <asm/atomic.h> - -+struct ve_struct; - struct flowi { - int oif; - int iif; -@@ -78,6 +79,9 @@ struct flowi { - #define fl_icmp_type uli_u.icmpt.type - #define fl_icmp_code uli_u.icmpt.code - #define fl_ipsec_spi uli_u.spi -+#ifdef CONFIG_VE -+ struct ve_struct *owner_env; -+#endif - } __attribute__((__aligned__(BITS_PER_LONG/8))); - - #define FLOW_DIR_IN 0 -diff -upr linux-2.6.16.orig/include/net/icmp.h linux-2.6.16-026test009/include/net/icmp.h ---- linux-2.6.16.orig/include/net/icmp.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/icmp.h 2006-04-19 15:02:12.000000000 +0400 -@@ -31,9 +31,14 @@ struct 
icmp_err { - - extern struct icmp_err icmp_err_convert[]; - DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); --#define ICMP_INC_STATS(field) SNMP_INC_STATS(icmp_statistics, field) --#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) --#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ve_icmp_statistics (get_exec_env()->_icmp_statistics) -+#else -+#define ve_icmp_statistics icmp_statistics -+#endif -+#define ICMP_INC_STATS(field) SNMP_INC_STATS(ve_icmp_statistics, field) -+#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmp_statistics, field) -+#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmp_statistics, field) - - struct dst_entry; - struct net_proto_family; -diff -upr linux-2.6.16.orig/include/net/inet_hashtables.h linux-2.6.16-026test009/include/net/inet_hashtables.h ---- linux-2.6.16.orig/include/net/inet_hashtables.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/inet_hashtables.h 2006-04-19 15:02:12.000000000 +0400 -@@ -24,6 +24,7 @@ - #include <linux/spinlock.h> - #include <linux/types.h> - #include <linux/wait.h> -+#include <linux/ve_owner.h> - - #include <net/inet_connection_sock.h> - #include <net/inet_sock.h> -@@ -75,11 +76,13 @@ struct inet_ehash_bucket { - * ports are created in O(1) time? I thought so. 
;-) -DaveM - */ - struct inet_bind_bucket { -+ struct ve_struct *owner_env; - unsigned short port; - signed short fastreuse; - struct hlist_node node; - struct hlist_head owners; - }; -+DCL_VE_OWNER_PROTO(TB, struct inet_bind_bucket, owner_env) - - #define inet_bind_bucket_for_each(tb, node, head) \ - hlist_for_each_entry(tb, node, head, node) -@@ -139,37 +142,43 @@ static inline struct inet_ehash_bucket * - extern struct inet_bind_bucket * - inet_bind_bucket_create(kmem_cache_t *cachep, - struct inet_bind_hashbucket *head, -- const unsigned short snum); -+ const unsigned short snum, -+ struct ve_struct *env); - extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, - struct inet_bind_bucket *tb); - --static inline int inet_bhashfn(const __u16 lport, const int bhash_size) -+static inline int inet_bhashfn(const __u16 lport, const int bhash_size, -+ unsigned veid) - { -- return lport & (bhash_size - 1); -+ return ((lport + (veid ^ (veid >> 16))) & (bhash_size - 1)); - } - - extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - const unsigned short snum); - - /* These can have wildcards, don't try too hard. */ --static inline int inet_lhashfn(const unsigned short num) -+static inline int inet_lhashfn(const unsigned short num, unsigned veid) - { -- return num & (INET_LHTABLE_SIZE - 1); -+ return ((num + (veid ^ (veid >> 16))) & (INET_LHTABLE_SIZE - 1)); - } - - static inline int inet_sk_listen_hashfn(const struct sock *sk) - { -- return inet_lhashfn(inet_sk(sk)->num); -+ return inet_lhashfn(inet_sk(sk)->num, VEID(VE_OWNER_SK(sk))); - } - - /* Caller must disable local BH processing. 
*/ - static inline void __inet_inherit_port(struct inet_hashinfo *table, - struct sock *sk, struct sock *child) - { -- const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); -- struct inet_bind_hashbucket *head = &table->bhash[bhash]; -+ int bhash; -+ struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - -+ bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size, -+ VEID(VE_OWNER_SK(child))); -+ head = &table->bhash[bhash]; -+ - spin_lock(&head->lock); - tb = inet_csk(sk)->icsk_bind_hash; - sk_add_bind_node(child, &tb->owners); -@@ -275,7 +284,8 @@ static inline int inet_iif(const struct - extern struct sock *__inet_lookup_listener(const struct hlist_head *head, - const u32 daddr, - const unsigned short hnum, -- const int dif); -+ const int dif, -+ struct ve_struct *env); - - /* Optimize the common listener case. */ - static inline struct sock * -@@ -285,18 +295,21 @@ static inline struct sock * - { - struct sock *sk = NULL; - const struct hlist_head *head; -+ struct ve_struct *env; - -+ env = get_exec_env(); - read_lock(&hashinfo->lhash_lock); -- head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; -+ head = &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))]; - if (!hlist_empty(head)) { - const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); - - if (inet->num == hnum && !sk->sk_node.next && -+ ve_accessible_strict(VE_OWNER_SK(sk), env) && - (!inet->rcv_saddr || inet->rcv_saddr == daddr) && - (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && - !sk->sk_bound_dev_if) - goto sherry_cache; -- sk = __inet_lookup_listener(head, daddr, hnum, dif); -+ sk = __inet_lookup_listener(head, daddr, hnum, dif, env); - } - if (sk) { - sherry_cache: -@@ -323,25 +336,25 @@ sherry_cache: - #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ - const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr)); - #endif /* __BIG_ENDIAN */ --#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ -+#define 
INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ - (((__sk)->sk_hash == (__hash)) && \ - ((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ - ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) --#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ -+#define INET_TW_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ - (((__sk)->sk_hash == (__hash)) && \ - ((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ - ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) - #else /* 32-bit arch */ - #define INET_ADDR_COOKIE(__name, __saddr, __daddr) --#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ -+#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ - (((__sk)->sk_hash == (__hash)) && \ - (inet_sk(__sk)->daddr == (__saddr)) && \ - (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ - ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) --#define INET_TW_MATCH(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ -+#define INET_TW_MATCH_ALLVE(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ - (((__sk)->sk_hash == (__hash)) && \ - (inet_twsk(__sk)->tw_daddr == (__saddr)) && \ - (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ -@@ -349,6 +362,18 @@ sherry_cache: - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) - #endif /* 64-bit arch */ - -+#define INET_MATCH(__sk, __hash, __cookie, __saddr, \ -+ __daddr, __ports, __dif, __ve) \ -+ (INET_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ -+ (__daddr), (__ports), (__dif)) \ -+ && ve_accessible_strict(VE_OWNER_SK(__sk), (__ve))) -+ -+#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, \ -+ __daddr, 
__ports, __dif, __ve) \ -+ (INET_TW_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ -+ (__daddr), (__ports), (__dif)) \ -+ && ve_accessible_strict(inet_twsk(__sk)->tw_owner_env, VEID(__ve))) -+ - /* - * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need - * not check it for lookups anymore, thanks Alexey. -DaveM -@@ -368,19 +393,25 @@ static inline struct sock * - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ -- unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); -- struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); -- -+ unsigned int hash; -+ struct inet_ehash_bucket *head; -+ struct ve_struct *env; -+ -+ env = get_exec_env(); -+ hash = inet_ehashfn(daddr, hnum, saddr, sport, VEID(env)); -+ head = inet_ehash_bucket(hashinfo, hash); - prefetch(head->chain.first); - read_lock(&head->lock); - sk_for_each(sk, node, &head->chain) { -- if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) -+ if (INET_MATCH(sk, hash, acookie, saddr, daddr, -+ ports, dif, env)) - goto hit; /* You sunk my battleship! */ - } - - /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ - sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { -- if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) -+ if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, -+ ports, dif, env)) - goto hit; - } - sk = NULL; -diff -upr linux-2.6.16.orig/include/net/inet_sock.h linux-2.6.16-026test009/include/net/inet_sock.h ---- linux-2.6.16.orig/include/net/inet_sock.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/inet_sock.h 2006-04-19 15:02:12.000000000 +0400 -@@ -171,9 +171,10 @@ static inline void inet_sk_copy_descenda - extern int inet_sk_rebuild_header(struct sock *sk); - - static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, -- const __u32 faddr, const __u16 fport) -+ const __u32 faddr, const __u16 fport, -+ const envid_t veid) - { -- unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); -+ int h = (laddr ^ lport) ^ (faddr ^ fport) ^ (veid ^ (veid >> 16)); - h ^= h >> 16; - h ^= h >> 8; - return h; -@@ -186,8 +187,9 @@ static inline int inet_sk_ehashfn(const - const __u16 lport = inet->num; - const __u32 faddr = inet->daddr; - const __u16 fport = inet->dport; -+ envid_t veid = VEID(VE_OWNER_SK(sk)); - -- return inet_ehashfn(laddr, lport, faddr, fport); -+ return inet_ehashfn(laddr, lport, faddr, fport, veid); - } - - #endif /* _INET_SOCK_H */ -diff -upr linux-2.6.16.orig/include/net/inet_timewait_sock.h linux-2.6.16-026test009/include/net/inet_timewait_sock.h ---- linux-2.6.16.orig/include/net/inet_timewait_sock.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/inet_timewait_sock.h 2006-04-19 15:02:12.000000000 +0400 -@@ -134,6 +134,7 @@ struct inet_timewait_sock { - unsigned long tw_ttd; - struct inet_bind_bucket *tw_tb; - struct hlist_node tw_death_node; -+ envid_t tw_owner_env; - }; - - static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, -diff -upr linux-2.6.16.orig/include/net/ip.h linux-2.6.16-026test009/include/net/ip.h ---- 
linux-2.6.16.orig/include/net/ip.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/ip.h 2006-04-19 15:02:12.000000000 +0400 -@@ -95,6 +95,7 @@ extern int ip_local_deliver(struct sk_b - extern int ip_mr_input(struct sk_buff *skb); - extern int ip_output(struct sk_buff *skb); - extern int ip_mc_output(struct sk_buff *skb); -+extern int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); - extern int ip_do_nat(struct sk_buff *skb); - extern void ip_send_check(struct iphdr *ip); - extern int ip_queue_xmit(struct sk_buff *skb, int ipfragok); -@@ -152,15 +153,25 @@ struct ipv4_config - - extern struct ipv4_config ipv4_config; - DECLARE_SNMP_STAT(struct ipstats_mib, ip_statistics); --#define IP_INC_STATS(field) SNMP_INC_STATS(ip_statistics, field) --#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ip_statistics, field) --#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ip_statistics, field) -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ve_ip_statistics (get_exec_env()->_ip_statistics) -+#else -+#define ve_ip_statistics ip_statistics -+#endif -+#define IP_INC_STATS(field) SNMP_INC_STATS(ve_ip_statistics, field) -+#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ip_statistics, field) -+#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ip_statistics, field) - DECLARE_SNMP_STAT(struct linux_mib, net_statistics); --#define NET_INC_STATS(field) SNMP_INC_STATS(net_statistics, field) --#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(net_statistics, field) --#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(net_statistics, field) --#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(net_statistics, field, adnd) --#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(net_statistics, field, adnd) -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ve_net_statistics (get_exec_env()->_net_statistics) -+#else -+#define ve_net_statistics net_statistics 
-+#endif -+#define NET_INC_STATS(field) SNMP_INC_STATS(ve_net_statistics, field) -+#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_net_statistics, field) -+#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_net_statistics, field) -+#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(ve_net_statistics, field, adnd) -+#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(ve_net_statistics, field, adnd) - - extern int sysctl_local_port_range[2]; - extern int sysctl_ip_default_ttl; -@@ -380,4 +391,11 @@ extern int ip_misc_proc_init(void); - - extern struct ctl_table ipv4_table[]; - -+#ifdef CONFIG_SYSCTL -+extern int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, -+ void __user *buffer, size_t *lenp, loff_t *ppos); -+extern int ipv4_sysctl_forward_strategy(ctl_table *table, int __user *name, -+ int nlen, void __user *oldval, size_t __user *oldlenp, -+ void __user *newval, size_t newlen, void **context); -+#endif - #endif /* _IP_H */ -diff -upr linux-2.6.16.orig/include/net/ip_fib.h linux-2.6.16-026test009/include/net/ip_fib.h ---- linux-2.6.16.orig/include/net/ip_fib.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/ip_fib.h 2006-04-19 15:02:12.000000000 +0400 -@@ -168,10 +168,22 @@ struct fib_table { - unsigned char tb_data[0]; - }; - -+struct fn_zone; -+struct fn_hash -+{ -+ struct fn_zone *fn_zones[33]; -+ struct fn_zone *fn_zone_list; -+}; -+ - #ifndef CONFIG_IP_MULTIPLE_TABLES - -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ip_fib_local_table get_exec_env()->_local_table -+#define ip_fib_main_table get_exec_env()->_main_table -+#else - extern struct fib_table *ip_fib_local_table; - extern struct fib_table *ip_fib_main_table; -+#endif - - static inline struct fib_table *fib_get_table(int id) - { -@@ -203,7 +215,12 @@ static inline void fib_select_default(co - #define ip_fib_local_table (fib_tables[RT_TABLE_LOCAL]) - #define ip_fib_main_table 
(fib_tables[RT_TABLE_MAIN]) - -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define fib_tables get_exec_env()->_fib_tables -+#else - extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; -+#endif -+ - extern int fib_lookup(const struct flowi *flp, struct fib_result *res); - extern struct fib_table *__fib_new_table(int id); - extern void fib_rule_put(struct fib_rule *r); -@@ -250,10 +267,19 @@ extern u32 __fib_res_prefsrc(struct fib - - /* Exported by fib_hash.c */ - extern struct fib_table *fib_hash_init(int id); -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+struct ve_struct; -+extern int init_ve_route(struct ve_struct *ve); -+extern void fini_ve_route(struct ve_struct *ve); -+#else -+#define init_ve_route(ve) (0) -+#define fini_ve_route(ve) do { } while (0) -+#endif - - #ifdef CONFIG_IP_MULTIPLE_TABLES - /* Exported by fib_rules.c */ -- -+extern int fib_rules_create(void); -+extern void fib_rules_destroy(void); - extern int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); - extern int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); - extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb); -diff -upr linux-2.6.16.orig/include/net/netlink_sock.h linux-2.6.16-026test009/include/net/netlink_sock.h ---- linux-2.6.16.orig/include/net/netlink_sock.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/netlink_sock.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,22 @@ -+#ifndef __NET_NETLINK_SOCK_H -+#define __NET_NETLINK_SOCK_H -+ -+struct netlink_sock { -+ /* struct sock has to be the first member of netlink_sock */ -+ struct sock sk; -+ u32 pid; -+ u32 dst_pid; -+ u32 dst_group; -+ u32 flags; -+ u32 subscriptions; -+ u32 ngroups; -+ unsigned long *groups; -+ unsigned long state; -+ wait_queue_head_t wait; -+ struct netlink_callback *cb; -+ spinlock_t cb_lock; -+ void (*data_ready)(struct sock *sk, int bytes); -+ struct module 
*module; -+}; -+ -+#endif /* __NET_NETLINK_SOCK_H */ -diff -upr linux-2.6.16.orig/include/net/route.h linux-2.6.16-026test009/include/net/route.h ---- linux-2.6.16.orig/include/net/route.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/route.h 2006-04-19 15:02:12.000000000 +0400 -@@ -201,4 +201,14 @@ static inline struct inet_peer *rt_get_p - - extern ctl_table ipv4_route_table[]; - -+#ifdef CONFIG_SYSCTL -+extern int ipv4_flush_delay; -+extern int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, -+ struct file *filp, void __user *buffer, size_t *lenp, -+ loff_t *ppos); -+extern int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, -+ int __user *name, int nlen, void __user *oldval, -+ size_t __user *oldlenp, void __user *newval, -+ size_t newlen, void **context); -+#endif - #endif /* _ROUTE_H */ -diff -upr linux-2.6.16.orig/include/net/scm.h linux-2.6.16-026test009/include/net/scm.h ---- linux-2.6.16.orig/include/net/scm.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/scm.h 2006-04-19 15:02:12.000000000 +0400 -@@ -40,7 +40,7 @@ static __inline__ int scm_send(struct so - memset(scm, 0, sizeof(*scm)); - scm->creds.uid = current->uid; - scm->creds.gid = current->gid; -- scm->creds.pid = current->tgid; -+ scm->creds.pid = virt_tgid(current); - if (msg->msg_controllen <= 0) - return 0; - return __scm_send(sock, msg, scm); -diff -upr linux-2.6.16.orig/include/net/sock.h linux-2.6.16-026test009/include/net/sock.h ---- linux-2.6.16.orig/include/net/sock.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/sock.h 2006-04-19 15:02:12.000000000 +0400 -@@ -55,6 +55,8 @@ - #include <net/dst.h> - #include <net/checksum.h> - -+#include <ub/ub_net.h> -+ - /* - * This structure really needs to be cleaned up. 
- * Most of it is for TCP, and not used by any of -@@ -251,8 +253,12 @@ struct sock { - int (*sk_backlog_rcv)(struct sock *sk, - struct sk_buff *skb); - void (*sk_destruct)(struct sock *sk); -+ struct sock_beancounter sk_bc; -+ struct ve_struct *sk_owner_env; - }; - -+DCL_VE_OWNER_PROTO(SK, struct sock, sk_owner_env) -+ - /* - * Hashed lists helper routines - */ -@@ -485,7 +491,8 @@ static inline void sk_add_backlog(struct - }) - - extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); --extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); -+extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p, -+ unsigned long amount); - extern void sk_stream_wait_close(struct sock *sk, long timeo_p); - extern int sk_stream_error(struct sock *sk, int flags, int err); - extern void sk_stream_kill_queues(struct sock *sk); -@@ -706,8 +713,11 @@ static inline void sk_stream_writequeue_ - - static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) - { -- return (int)skb->truesize <= sk->sk_forward_alloc || -- sk_stream_mem_schedule(sk, skb->truesize, 1); -+ if ((int)skb->truesize > sk->sk_forward_alloc && -+ !sk_stream_mem_schedule(sk, skb->truesize, 1)) -+ /* The situation is bad according to mainstream. 
Den */ -+ return 0; -+ return ub_tcprcvbuf_charge(sk, skb) == 0; - } - - static inline int sk_stream_wmem_schedule(struct sock *sk, int size) -@@ -765,6 +775,11 @@ extern struct sk_buff *sock_alloc_send - unsigned long size, - int noblock, - int *errcode); -+extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk, -+ unsigned long size, -+ unsigned long size2, -+ int noblock, -+ int *errcode); - extern void *sock_kmalloc(struct sock *sk, int size, - gfp_t priority); - extern void sock_kfree_s(struct sock *sk, void *mem, int size); -@@ -1142,6 +1157,10 @@ static inline int sock_queue_rcv_skb(str - goto out; - } - -+ err = ub_sockrcvbuf_charge(sk, skb); -+ if (err < 0) -+ goto out; -+ - /* It would be deadlock, if sock_queue_rcv_skb is used - with socket lock! We assume that users of this - function are lock free. -diff -upr linux-2.6.16.orig/include/net/tcp.h linux-2.6.16-026test009/include/net/tcp.h ---- linux-2.6.16.orig/include/net/tcp.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/tcp.h 2006-04-19 15:02:12.000000000 +0400 -@@ -40,6 +40,7 @@ - #include <net/tcp_states.h> - - #include <linux/seq_file.h> -+#include <ub/ub_net.h> - - extern struct inet_hashinfo tcp_hashinfo; - -@@ -250,12 +251,17 @@ static inline int between(__u32 seq1, __ - extern struct proto tcp_prot; - - DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics); --#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field) --#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field) --#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field) --#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field) --#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) --#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ve_tcp_statistics (get_exec_env()->_tcp_statistics) -+#else -+#define 
ve_tcp_statistics tcp_statistics -+#endif -+#define TCP_INC_STATS(field) SNMP_INC_STATS(ve_tcp_statistics, field) -+#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_tcp_statistics, field) -+#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_tcp_statistics, field) -+#define TCP_DEC_STATS(field) SNMP_DEC_STATS(ve_tcp_statistics, field) -+#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_tcp_statistics, field, val) -+#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(ve_tcp_statistics, field, val) - - extern void tcp_v4_err(struct sk_buff *skb, u32); - -@@ -493,7 +499,7 @@ extern u32 __tcp_select_window(struct so - * to use only the low 32-bits of jiffies and hide the ugly - * casts with the following macro. - */ --#define tcp_time_stamp ((__u32)(jiffies)) -+#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup)) - - /* This is what the send packet queuing engine uses to pass - * TCP per-packet control information to the transmission -diff -upr linux-2.6.16.orig/include/net/udp.h linux-2.6.16-026test009/include/net/udp.h ---- linux-2.6.16.orig/include/net/udp.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/include/net/udp.h 2006-04-19 15:02:12.000000000 +0400 -@@ -39,13 +39,19 @@ extern rwlock_t udp_hash_lock; - - extern int udp_port_rover; - --static inline int udp_lport_inuse(u16 num) -+static inline int udp_hashfn(u16 num, unsigned veid) -+{ -+ return ((num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1)); -+} -+ -+static inline int udp_lport_inuse(u16 num, struct ve_struct *env) - { - struct sock *sk; - struct hlist_node *node; - -- sk_for_each(sk, node, &udp_hash[num & (UDP_HTABLE_SIZE - 1)]) -- if (inet_sk(sk)->num == num) -+ sk_for_each(sk, node, &udp_hash[udp_hashfn(num, VEID(env))]) -+ if (inet_sk(sk)->num == num && -+ ve_accessible_strict(sk->sk_owner_env, env)) - return 1; - return 0; - } -@@ -75,9 +81,14 @@ extern unsigned int udp_poll(struct file - poll_table *wait); - - 
DECLARE_SNMP_STAT(struct udp_mib, udp_statistics); --#define UDP_INC_STATS(field) SNMP_INC_STATS(udp_statistics, field) --#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_statistics, field) --#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_statistics, field) -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ve_udp_statistics (get_exec_env()->_udp_statistics) -+#else -+#define ve_udp_statistics udp_statistics -+#endif -+#define UDP_INC_STATS(field) SNMP_INC_STATS(ve_udp_statistics, field) -+#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_udp_statistics, field) -+#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_udp_statistics, field) - - /* /proc */ - struct udp_seq_afinfo { -diff -upr linux-2.6.16.orig/include/ub/beancounter.h linux-2.6.16-026test009/include/ub/beancounter.h ---- linux-2.6.16.orig/include/ub/beancounter.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/beancounter.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,329 @@ -+/* -+ * include/ub/beancounter.h -+ * -+ * Copyright (C) 1999-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * Andrey Savochkin saw@sw-soft.com -+ * -+ */ -+ -+#ifndef _LINUX_BEANCOUNTER_H -+#define _LINUX_BEANCOUNTER_H -+ -+#include <linux/config.h> -+ -+/* -+ * Generic ratelimiting stuff. -+ */ -+ -+struct ub_rate_info { -+ int burst; -+ int interval; /* jiffy_t per event */ -+ int bucket; /* kind of leaky bucket */ -+ unsigned long last; /* last event */ -+}; -+ -+/* Return true if rate limit permits. */ -+int ub_ratelimit(struct ub_rate_info *); -+ -+ -+/* -+ * This magic is used to distinuish user beancounter and pages beancounter -+ * in struct page. page_ub and page_bc are placed in union and MAGIC -+ * ensures us that we don't use pbc as ubc in ub_page_uncharge(). -+ */ -+#define UB_MAGIC 0x62756275 -+ -+/* -+ * Resource list. 
-+ */ -+ -+#define UB_KMEMSIZE 0 /* Unswappable kernel memory size including -+ * struct task, page directories, etc. -+ */ -+#define UB_LOCKEDPAGES 1 /* Mlock()ed pages. */ -+#define UB_PRIVVMPAGES 2 /* Total number of pages, counting potentially -+ * private pages as private and used. -+ */ -+#define UB_SHMPAGES 3 /* IPC SHM segment size. */ -+#define UB_ZSHMPAGES 4 /* Anonymous shared memory. */ -+#define UB_NUMPROC 5 /* Number of processes. */ -+#define UB_PHYSPAGES 6 /* All resident pages, for swapout guarantee. */ -+#define UB_VMGUARPAGES 7 /* Guarantee for memory allocation, -+ * checked against PRIVVMPAGES. -+ */ -+#define UB_OOMGUARPAGES 8 /* Guarantees against OOM kill. -+ * Only limit is used, no accounting. -+ */ -+#define UB_NUMTCPSOCK 9 /* Number of TCP sockets. */ -+#define UB_NUMFLOCK 10 /* Number of file locks. */ -+#define UB_NUMPTY 11 /* Number of PTYs. */ -+#define UB_NUMSIGINFO 12 /* Number of siginfos. */ -+#define UB_TCPSNDBUF 13 /* Total size of tcp send buffers. */ -+#define UB_TCPRCVBUF 14 /* Total size of tcp receive buffers. */ -+#define UB_OTHERSOCKBUF 15 /* Total size of other socket -+ * send buffers (all buffers for PF_UNIX). -+ */ -+#define UB_DGRAMRCVBUF 16 /* Total size of other socket -+ * receive buffers. -+ */ -+#define UB_NUMOTHERSOCK 17 /* Number of other sockets. */ -+#define UB_DCACHESIZE 18 /* Size of busy dentry/inode cache. */ -+#define UB_NUMFILE 19 /* Number of open files. */ -+ -+#define UB_RESOURCES 24 -+ -+#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0) -+#define UB_TMPFSPAGES (UB_RESOURCES + 1) -+#define UB_SWAPPAGES (UB_RESOURCES + 2) -+#define UB_HELDPAGES (UB_RESOURCES + 3) -+ -+struct ubparm { -+ /* -+ * A barrier over which resource allocations are failed gracefully. -+ * If the amount of consumed memory is over the barrier further sbrk() -+ * or mmap() calls fail, the existing processes are not killed. 
-+ */ -+ unsigned long barrier; -+ /* hard resource limit */ -+ unsigned long limit; -+ /* consumed resources */ -+ unsigned long held; -+ /* maximum amount of consumed resources through the last period */ -+ unsigned long maxheld; -+ /* minimum amount of consumed resources through the last period */ -+ unsigned long minheld; -+ /* count of failed charges */ -+ unsigned long failcnt; -+}; -+ -+/* -+ * Kernel internal part. -+ */ -+ -+#ifdef __KERNEL__ -+ -+#include <ub/ub_debug.h> -+#include <linux/interrupt.h> -+#include <asm/atomic.h> -+#include <linux/spinlock.h> -+#include <linux/cache.h> -+#include <linux/threads.h> -+ -+/* -+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form. -+ */ -+#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1) -+ -+ -+/* -+ * Resource management structures -+ * Serialization issues: -+ * beancounter list management is protected via ub_hash_lock -+ * task pointers are set only for current task and only once -+ * refcount is managed atomically -+ * value and limit comparison and change are protected by per-ub spinlock -+ */ -+ -+struct page_beancounter; -+struct task_beancounter; -+struct sock_beancounter; -+ -+struct page_private { -+ unsigned long ubp_unused_privvmpages; -+ unsigned long ubp_tmpfs_respages; -+ unsigned long ubp_swap_pages; -+ unsigned long long ubp_held_pages; -+}; -+ -+struct sock_private { -+ unsigned long ubp_rmem_thres; -+ unsigned long ubp_wmem_pressure; -+ unsigned long ubp_maxadvmss; -+ unsigned long ubp_rmem_pressure; -+#define UB_RMEM_EXPAND 0 -+#define UB_RMEM_KEEP 1 -+#define UB_RMEM_SHRINK 2 -+ struct list_head ubp_other_socks; -+ struct list_head ubp_tcp_socks; -+ atomic_t ubp_orphan_count; -+}; -+ -+struct ub_perfstat { -+ unsigned long unmap; -+ unsigned long swapin; -+ -+#ifdef CONFIG_UBC_DEBUG_KMEM -+ long pages_charged; -+ long vmalloc_charged; -+ long pbcs; -+#endif -+} ____cacheline_aligned_in_smp; -+ -+struct user_beancounter -+{ -+ unsigned long ub_magic; 
-+ atomic_t ub_refcount; -+ struct user_beancounter *ub_next; -+ spinlock_t ub_lock; -+ uid_t ub_uid; -+ -+ struct ub_rate_info ub_limit_rl; -+ int ub_oom_noproc; -+ -+ struct page_private ppriv; -+#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages -+#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages -+#define ub_swap_pages ppriv.ubp_swap_pages -+#define ub_held_pages ppriv.ubp_held_pages -+ struct sock_private spriv; -+#define ub_rmem_thres spriv.ubp_rmem_thres -+#define ub_maxadvmss spriv.ubp_maxadvmss -+#define ub_rmem_pressure spriv.ubp_rmem_pressure -+#define ub_wmem_pressure spriv.ubp_wmem_pressure -+#define ub_tcp_sk_list spriv.ubp_tcp_socks -+#define ub_other_sk_list spriv.ubp_other_socks -+#define ub_orphan_count spriv.ubp_orphan_count -+ -+ struct user_beancounter *parent; -+ void *private_data; -+ -+ /* resources statistic and settings */ -+ struct ubparm ub_parms[UB_RESOURCES]; -+ /* resources statistic for last interval */ -+ struct ubparm ub_store[UB_RESOURCES]; -+ -+ struct ub_perfstat ub_stat[NR_CPUS]; -+ -+#ifdef CONFIG_UBC_DEBUG_KMEM -+ struct list_head ub_cclist; -+#endif -+}; -+ -+enum severity { UB_HARD, UB_SOFT, UB_FORCE }; -+ -+static inline int ub_barrier_hit(struct user_beancounter *ub, int resource) -+{ -+ return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier; -+} -+ -+static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource) -+{ -+ return (ub->ub_parms[resource].held > -+ ((ub->ub_parms[resource].barrier) >> 1)); -+} -+ -+#ifndef CONFIG_USER_RESOURCE -+ -+extern inline struct user_beancounter *get_beancounter_byuid -+ (uid_t uid, int create) { return NULL; } -+extern inline struct user_beancounter *get_beancounter -+ (struct user_beancounter *ub) { return NULL; } -+extern inline void put_beancounter(struct user_beancounter *ub) {;} -+ -+static inline void ub_init_cache(unsigned long mempages) { }; -+static inline void ub_init_ub0(void) { }; -+ -+#define get_ub0() NULL -+ -+#else /* 
CONFIG_USER_RESOURCE */ -+ -+/* -+ * Charge/uncharge operations -+ */ -+ -+extern int __charge_beancounter_locked(struct user_beancounter *ub, -+ int resource, unsigned long val, enum severity strict); -+ -+extern void __uncharge_beancounter_locked(struct user_beancounter *ub, -+ int resource, unsigned long val); -+ -+extern void __put_beancounter(struct user_beancounter *ub); -+ -+extern void uncharge_warn(struct user_beancounter *ub, int resource, -+ unsigned long val, unsigned long held); -+ -+extern const char *ub_rnames[]; -+/* -+ * Put a beancounter reference -+ */ -+ -+static inline void put_beancounter(struct user_beancounter *ub) -+{ -+ if (unlikely(ub == NULL)) -+ return; -+ -+ __put_beancounter(ub); -+} -+ -+/* -+ * Create a new beancounter reference -+ */ -+extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create); -+ -+static inline -+struct user_beancounter *get_beancounter(struct user_beancounter *ub) -+{ -+ if (unlikely(ub == NULL)) -+ return NULL; -+ -+ atomic_inc(&ub->ub_refcount); -+ return ub; -+} -+ -+extern struct user_beancounter *get_subbeancounter_byid( -+ struct user_beancounter *, -+ int id, int create); -+extern struct user_beancounter *subbeancounter_findcreate( -+ struct user_beancounter *p, int id); -+ -+extern struct user_beancounter ub0; -+ -+extern void ub_init_cache(unsigned long); -+extern void ub_init_ub0(void); -+#define get_ub0() (&ub0) -+ -+extern void print_ub_uid(struct user_beancounter *ub, char *buf, int size); -+ -+/* -+ * Resource charging -+ * Change user's account and compare against limits -+ */ -+ -+static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource) -+{ -+ if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held) -+ ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held; -+ if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held) -+ ub->ub_parms[resource].minheld = ub->ub_parms[resource].held; -+} -+ -+#endif /* CONFIG_USER_RESOURCE */ -+ 
-+#include <ub/ub_decl.h> -+UB_DECLARE_FUNC(int, charge_beancounter(struct user_beancounter *ub, -+ int resource, unsigned long val, enum severity strict)); -+UB_DECLARE_VOID_FUNC(uncharge_beancounter(struct user_beancounter *ub, -+ int resource, unsigned long val)); -+ -+UB_DECLARE_VOID_FUNC(charge_beancounter_notop(struct user_beancounter *ub, -+ int resource, unsigned long val)); -+UB_DECLARE_VOID_FUNC(uncharge_beancounter_notop(struct user_beancounter *ub, -+ int resource, unsigned long val)); -+ -+#ifndef CONFIG_USER_RESOURCE_PROC -+static inline void ub_init_proc(void) { }; -+#else -+extern void ub_init_proc(void); -+#endif -+ -+#ifdef CONFIG_USER_RSS_ACCOUNTING -+extern void ub_init_pbc(void); -+#else -+static inline void ub_ini_pbc(void) { } -+#endif -+#endif /* __KERNEL__ */ -+#endif /* _LINUX_BEANCOUNTER_H */ -diff -upr linux-2.6.16.orig/include/ub/ub_dcache.h linux-2.6.16-026test009/include/ub/ub_dcache.h ---- linux-2.6.16.orig/include/ub/ub_dcache.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_dcache.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,57 @@ -+/* -+ * include/ub/ub_dcache.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_DCACHE_H_ -+#define __UB_DCACHE_H_ -+ -+#include <ub/ub_decl.h> -+ -+/* -+ * UB_DCACHESIZE accounting -+ */ -+ -+struct dentry_beancounter -+{ -+ /* -+ * d_inuse = -+ * <number of external refs> + -+ * <number of 'used' childs> -+ * -+ * d_inuse == -1 means that dentry is unused -+ * state change -1 => 0 causes charge -+ * state change 0 => -1 causes uncharge -+ */ -+ atomic_t d_inuse; -+ /* charged size, including name length if name is not inline */ -+ unsigned long d_ubsize; -+ struct user_beancounter *d_ub; -+}; -+ -+struct dentry; -+ -+UB_DECLARE_FUNC(int, ub_dentry_alloc(struct dentry *d)) -+UB_DECLARE_VOID_FUNC(ub_dentry_charge_nofail(struct dentry *d)) -+UB_DECLARE_VOID_FUNC(ub_dentry_uncharge(struct dentry *d)) -+ -+#ifdef CONFIG_USER_RESOURCE -+UB_DECLARE_FUNC(int, ub_dentry_charge(struct dentry *d)) -+#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse)) -+#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse)) -+#define INUSE_INIT 0 -+#else -+#define ub_dentry_charge(d) ({ \ -+ spin_unlock(&d->d_lock); \ -+ rcu_read_unlock(); \ -+ 0; \ -+ }) -+#define ub_dget_testone(d) (0) -+#define ub_dput_testzero(d) (0) -+#endif -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_debug.h linux-2.6.16-026test009/include/ub/ub_debug.h ---- linux-2.6.16.orig/include/ub/ub_debug.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_debug.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,95 @@ -+/* -+ * include/ub/ub_debug.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_DEBUG_H_ -+#define __UB_DEBUG_H_ -+ -+/* -+ * general debugging -+ */ -+ -+#define UBD_ALLOC 0x1 -+#define UBD_CHARGE 0x2 -+#define UBD_LIMIT 0x4 -+#define UBD_TRACE 0x8 -+ -+/* -+ * ub_net debugging -+ */ -+ -+#define UBD_NET_SOCKET 0x10 -+#define UBD_NET_SLEEP 0x20 -+#define UBD_NET_SEND 0x40 -+#define UBD_NET_RECV 0x80 -+ -+/* -+ * Main routines -+ */ -+ -+#define UB_DEBUG (0) -+#define DEBUG_RESOURCE (0ULL) -+ -+#define ub_dbg_cond(__cond, __str, args...) \ -+ do { \ -+ if ((__cond) != 0) \ -+ printk(__str, ##args); \ -+ } while(0) -+ -+#define ub_debug(__section, __str, args...) \ -+ ub_dbg_cond(UB_DEBUG & (__section), __str, ##args) -+ -+#define ub_debug_resource(__resource, __str, args...) \ -+ ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \ -+ (DEBUG_RESOURCE & (1 << (__resource))), \ -+ __str, ##args) -+ -+#if UB_DEBUG & UBD_TRACE -+#define ub_debug_trace(__cond, __b, __r) \ -+ do { \ -+ static struct ub_rate_info ri = { __b, __r }; \ -+ if ((__cond) != 0 && ub_ratelimit(&ri)) \ -+ dump_stack(); \ -+ } while(0) -+#else -+#define ub_debug_trace(__cond, __burst, __rate) -+#endif -+ -+#include <linux/config.h> -+ -+#ifdef CONFIG_UBC_DEBUG_KMEM -+#include <linux/list.h> -+#include <linux/kmem_cache.h> -+ -+struct user_beancounter; -+struct ub_cache_counter { -+ struct list_head ulist; -+ struct ub_cache_counter *next; -+ struct user_beancounter *ub; -+ kmem_cache_t *cachep; -+ unsigned long counter; -+}; -+ -+extern spinlock_t cc_lock; -+extern void init_cache_counters(void); -+extern void ub_free_counters(struct user_beancounter *); -+extern void ub_kmemcache_free(kmem_cache_t *cachep); -+ -+struct vm_struct; -+extern void inc_vmalloc_charged(struct vm_struct *, int); -+extern void dec_vmalloc_charged(struct vm_struct *); -+#else -+#define init_cache_counters() do { } while (0) -+#define inc_vmalloc_charged(vm, f) do { } while (0) -+#define dec_vmalloc_charged(vm) do { } while (0) -+#define ub_free_counters(ub) do { } while (0) -+#define 
ub_kmemcache_free(cachep) do { } while (0) -+#endif -+ -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_decl.h linux-2.6.16-026test009/include/ub/ub_decl.h ---- linux-2.6.16.orig/include/ub/ub_decl.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_decl.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,40 @@ -+/* -+ * include/ub/ub_decl.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef __UB_DECL_H_ -+#define __UB_DECL_H_ -+ -+#include <linux/config.h> -+ -+/* -+ * Naming convension: -+ * ub_<section|object>_<operation> -+ */ -+ -+#ifdef CONFIG_USER_RESOURCE -+ -+#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl; -+#define UB_DECLARE_VOID_FUNC(decl) extern void decl; -+ -+#else /* CONFIG_USER_RESOURCE */ -+ -+#define UB_DECLARE_FUNC(ret_type, decl) \ -+ static inline ret_type decl \ -+ { \ -+ return (ret_type)0; \ -+ } -+#define UB_DECLARE_VOID_FUNC(decl) \ -+ static inline void decl \ -+ { \ -+ } -+ -+#endif /* CONFIG_USER_RESOURCE */ -+ -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_hash.h linux-2.6.16-026test009/include/ub/ub_hash.h ---- linux-2.6.16.orig/include/ub/ub_hash.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_hash.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,41 @@ -+/* -+ * include/ub/ub_hash.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef _LINUX_UBHASH_H -+#define _LINUX_UBHASH_H -+ -+#ifdef __KERNEL__ -+ -+#define UB_HASH_SIZE 256 -+ -+struct ub_hash_slot { -+ struct user_beancounter *ubh_beans; -+}; -+ -+extern struct ub_hash_slot ub_hash[]; -+extern spinlock_t ub_hash_lock; -+ -+#ifdef CONFIG_USER_RESOURCE -+ -+/* -+ * Iterate over beancounters -+ * @__slot - hash slot -+ * @__ubp - beancounter ptr -+ * Can use break :) -+ */ -+#define for_each_beancounter(__slot, __ubp) \ -+ for (__slot = 0, __ubp = NULL; \ -+ __slot < UB_HASH_SIZE && __ubp == NULL; __slot++) \ -+ for (__ubp = ub_hash[__slot].ubh_beans; __ubp; \ -+ __ubp = __ubp->ub_next) -+ -+#endif /* CONFIG_USER_RESOURCE */ -+#endif /* __KERNEL__ */ -+#endif /* _LINUX_UBHASH_H */ -diff -upr linux-2.6.16.orig/include/ub/ub_mem.h linux-2.6.16-026test009/include/ub/ub_mem.h ---- linux-2.6.16.orig/include/ub/ub_mem.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_mem.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,85 @@ -+/* -+ * include/ub/ub_mem.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_SLAB_H_ -+#define __UB_SLAB_H_ -+ -+#include <linux/config.h> -+#include <linux/kmem_slab.h> -+#include <ub/beancounter.h> -+#include <ub/ub_decl.h> -+ -+/* -+ * UB_KMEMSIZE accounting -+ */ -+ -+#ifdef CONFIG_UBC_DEBUG_ITEMS -+#define CHARGE_ORDER(__o) (1 << __o) -+#define CHARGE_SIZE(__s) 1 -+#else -+#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o)) -+#define CHARGE_SIZE(__s) (__s) -+#endif -+ -+#define page_ub(__page) ((__page)->bc.page_ub) -+ -+struct mm_struct; -+struct page; -+ -+UB_DECLARE_FUNC(struct user_beancounter *, slab_ub(void *obj)) -+UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj)) -+UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj)) -+ -+UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, int mask)) -+UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order)) -+UB_DECLARE_FUNC(int, ub_slab_charge(void *objp, int flags)) -+UB_DECLARE_VOID_FUNC(ub_slab_uncharge(void *obj)) -+ -+#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\ -+ (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\ -+ sizeof(void *)))) -+ -+#ifdef CONFIG_USER_RESOURCE -+/* Flags without __GFP_UBC must comply with vmalloc */ -+#define ub_vmalloc(size) __vmalloc(size, \ -+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_UBC, PAGE_KERNEL) -+#define ub_vmalloc_node(size, node) __vmalloc_node(size, \ -+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_UBC, PAGE_KERNEL, node) -+#define ub_kmalloc(size, flags) kmalloc(size, ((flags) | __GFP_UBC)) -+extern struct user_beancounter *ub_select_worst(long *); -+ -+/* mm/slab.c needed stuff */ -+#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1) -+#define UB_EXTRA(flags) (flags & SLAB_UBC ? 
sizeof(void *) : 0) -+#define set_cache_objuse(cachep) do { \ -+ (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \ -+ (cachep)->num - 1) / (cachep)->num; \ -+ if (!OFF_SLAB(cachep)) \ -+ break; \ -+ (cachep)->objuse += ((cachep)->slabp_cache->objuse + \ -+ (cachep)->num - 1) / (cachep)->num; \ -+ } while (0) -+#define init_slab_ubps(cachep, slabp) do { \ -+ if (!((cachep)->flags & SLAB_UBC)) \ -+ break; \ -+ memset(slab_ubcs(cachep, slabp), 0, \ -+ (cachep)->num * sizeof(void *)); \ -+ } while (0) -+#define kmem_obj_memusage(o) (virt_to_cache(o)->objuse) -+#else -+#define ub_vmalloc(size) vmalloc(size) -+#define ub_vmalloc_node(size, node) vmalloc_node(size, node) -+#define ub_kmalloc(size, flags) kmalloc(size, flags) -+#define UB_ALIGN(flags) 1 -+#define UB_EXTRA(flags) 0 -+#define set_cache_objuse(c) do { } while (0) -+#define init_slab_ubps(c, s) do { } while (0) -+#endif -+#endif /* __UB_SLAB_H_ */ -diff -upr linux-2.6.16.orig/include/ub/ub_misc.h linux-2.6.16-026test009/include/ub/ub_misc.h ---- linux-2.6.16.orig/include/ub/ub_misc.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_misc.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,49 @@ -+/* -+ * include/ub/ub_misc.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_MISC_H_ -+#define __UB_MISC_H_ -+ -+#include <ub/ub_decl.h> -+ -+struct tty_struct; -+struct file; -+struct file_lock; -+struct sigqueue; -+ -+UB_DECLARE_FUNC(int, ub_file_charge(struct file *f)) -+UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f)) -+UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard)) -+UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl)) -+UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q, -+ struct user_beancounter *ub)) -+UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q)) -+UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent, -+ struct task_struct *task)) -+UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task)) -+UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty)) -+UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty)) -+ -+#ifdef CONFIG_USER_RESOURCE -+#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0) -+#define set_mm_ub(mm, tsk) do { \ -+ (mm)->mm_ub = get_beancounter(tsk ? \ -+ tsk->task_bc.task_ub : get_exec_ub()); \ -+ } while (0) -+#define put_mm_ub(mm) do { \ -+ put_beancounter((mm)->mm_ub); \ -+ (mm)->mm_ub = NULL; \ -+ } while (0) -+#else -+#define set_flock_charged(fl) do { } while (0) -+#define set_mm_ub(mm, tsk) do { } while (0) -+#define put_mm_ub(mm) do { } while (0) -+#endif -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_net.h linux-2.6.16-026test009/include/ub/ub_net.h ---- linux-2.6.16.orig/include/ub/ub_net.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_net.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,141 @@ -+/* -+ * include/ub/ub_net.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_NET_H_ -+#define __UB_NET_H_ -+ -+/* -+ * UB_NUMXXXSOCK, UB_XXXBUF accounting -+ */ -+ -+#include <ub/ub_decl.h> -+#include <ub/ub_sk.h> -+ -+#define bid2sid(__bufid) \ -+ ((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK) -+ -+#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \ -+ ~(SMP_CACHE_BYTES-1))) -+#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE) -+ -+ -+#define IS_TCP_SOCK(__family, __type) \ -+ ((__family) == PF_INET && (__type) == SOCK_STREAM) -+ -+UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type)) -+UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) -+UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk)) -+UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk)) -+UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb)) -+UB_DECLARE_FUNC(int, ub_skb_alloc_bc(struct sk_buff *skb, int gfp_mask)) -+UB_DECLARE_VOID_FUNC(ub_skb_free_bc(struct sk_buff *skb)) -+UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)) -+UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)) -+UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource, -+ unsigned long size)) -+UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, -+ unsigned long size)) -+ -+UB_DECLARE_FUNC(int, ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb)) -+UB_DECLARE_FUNC(int, ub_tcprcvbuf_charge_forced(struct sock *sk, -+ struct sk_buff *skb)) -+UB_DECLARE_FUNC(int, ub_tcpsndbuf_charge(struct sock *sk, struct sk_buff *skb)) -+UB_DECLARE_FUNC(int, ub_tcpsndbuf_charge_forced(struct sock *sk, -+ struct sk_buff *skb)) -+ -+/* Charge size */ -+static inline unsigned long skb_charge_datalen(unsigned long chargesize) -+{ -+#ifdef CONFIG_USER_RESOURCE -+ unsigned long slabsize; -+ -+ chargesize -= sizeof(struct sk_buff); -+ slabsize = 64; -+ do { -+ slabsize <<= 1; -+ } while (slabsize <= chargesize); -+ 
-+ slabsize >>= 1; -+ return (slabsize - sizeof(struct skb_shared_info)) & -+ ~(SMP_CACHE_BYTES-1); -+#else -+ return 0; -+#endif -+} -+ -+static inline unsigned long skb_charge_size_gen(unsigned long size) -+{ -+#ifdef CONFIG_USER_RESOURCE -+ unsigned int slabsize; -+ -+ size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info); -+ slabsize = 32; /* min size is 64 because of skb_shared_info */ -+ do { -+ slabsize <<= 1; -+ } while (slabsize < size); -+ -+ return slabsize + sizeof(struct sk_buff); -+#else -+ return 0; -+#endif -+ -+} -+ -+static inline unsigned long skb_charge_size_const(unsigned long size) -+{ -+#ifdef CONFIG_USER_RESOURCE -+ unsigned int ret; -+ if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64) -+ ret = 64 + sizeof(struct sk_buff); -+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128) -+ ret = 128 + sizeof(struct sk_buff); -+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256) -+ ret = 256 + sizeof(struct sk_buff); -+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512) -+ ret = 512 + sizeof(struct sk_buff); -+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024) -+ ret = 1024 + sizeof(struct sk_buff); -+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048) -+ ret = 2048 + sizeof(struct sk_buff); -+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096) -+ ret = 4096 + sizeof(struct sk_buff); -+ else -+ ret = skb_charge_size_gen(size); -+ return ret; -+#else -+ return 0; -+#endif -+} -+ -+ -+#define skb_charge_size(__size) \ -+ (__builtin_constant_p(__size) ? 
\ -+ skb_charge_size_const(__size) : \ -+ skb_charge_size_gen(__size)) -+ -+UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb)) -+UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, -+ struct sock *sk, unsigned long size, int res)) -+ -+/* Poll reserv */ -+UB_DECLARE_FUNC(int, ub_sock_makewres_other(struct sock *sk, unsigned long sz)) -+UB_DECLARE_FUNC(int, ub_sock_makewres_tcp(struct sock *sk, unsigned long size)) -+UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, unsigned long size)) -+UB_DECLARE_FUNC(int, ub_sock_getwres_tcp(struct sock *sk, unsigned long size)) -+UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, unsigned long size, -+ unsigned long ressize)) -+UB_DECLARE_VOID_FUNC(ub_sock_retwres_tcp(struct sock *sk, unsigned long size, -+ unsigned long ressize)) -+UB_DECLARE_VOID_FUNC(ub_sock_sndqueueadd_other(struct sock *sk, -+ unsigned long size)) -+UB_DECLARE_VOID_FUNC(ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz)) -+UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk)) -+ -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_orphan.h linux-2.6.16-026test009/include/ub/ub_orphan.h ---- linux-2.6.16.orig/include/ub/ub_orphan.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_orphan.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,56 @@ -+/* -+ * include/ub/ub_orphan.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_ORPHAN_H_ -+#define __UB_ORPHAN_H_ -+ -+#include <net/tcp.h> -+ -+#include "ub/beancounter.h" -+#include "ub/ub_net.h" -+ -+ -+static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk) -+{ -+#ifdef CONFIG_USER_RESOURCE -+ if (sock_has_ubc(sk)) -+ return &sock_bc(sk)->ub->ub_orphan_count; -+#endif -+ return sk->sk_prot->orphan_count; -+} -+ -+static inline void ub_inc_orphan_count(struct sock *sk) -+{ -+ atomic_inc(__ub_get_orphan_count_ptr(sk)); -+} -+ -+static inline void ub_dec_orphan_count(struct sock *sk) -+{ -+ atomic_dec(__ub_get_orphan_count_ptr(sk)); -+} -+ -+static inline int ub_get_orphan_count(struct sock *sk) -+{ -+ return atomic_read(__ub_get_orphan_count_ptr(sk)); -+} -+ -+extern int __ub_too_many_orphans(struct sock *sk, int count); -+static inline int ub_too_many_orphans(struct sock *sk, int count) -+{ -+#ifdef CONFIG_USER_RESOURCE -+ if (__ub_too_many_orphans(sk, count)) -+ return 1; -+#endif -+ return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans || -+ (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && -+ atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])); -+} -+ -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_page.h linux-2.6.16-026test009/include/ub/ub_page.h ---- linux-2.6.16.orig/include/ub/ub_page.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_page.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,48 @@ -+/* -+ * include/ub/ub_page.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_PAGE_H_ -+#define __UB_PAGE_H_ -+ -+#include <linux/config.h> -+ -+/* -+ * Page_beancounters -+ */ -+ -+struct page; -+struct user_beancounter; -+ -+#define PB_MAGIC 0x62700001UL -+ -+struct page_beancounter { -+ unsigned long pb_magic; -+ struct page *page; -+ struct user_beancounter *ub; -+ struct page_beancounter *next_hash; -+ unsigned refcount; -+ struct list_head page_list; -+}; -+ -+#define PB_REFCOUNT_BITS 24 -+#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS) -+#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS)) -+#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS)) -+#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1)) -+#define PB_COUNT_INC(c) ((c)++) -+#define PB_COUNT_DEC(c) ((c)--) -+#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c)) -+ -+#define page_pbc(__page) ((__page)->bc.page_pb) -+ -+struct address_space; -+extern int is_shmem_mapping(struct address_space *); -+ -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_sk.h linux-2.6.16-026test009/include/ub/ub_sk.h ---- linux-2.6.16.orig/include/ub/ub_sk.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_sk.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,43 @@ -+/* -+ * include/ub/ub_sk.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_SK_H_ -+#define __UB_SK_H_ -+ -+#include <linux/config.h> -+#include <ub/ub_task.h> -+ -+struct sock; -+struct sk_buff; -+ -+struct skb_beancounter { -+ struct user_beancounter *ub; -+ unsigned long charged:27, resource:5; -+}; -+ -+struct sock_beancounter { -+ /* -+ * already charged for future sends, to make poll work; -+ * changes are protected by bc spinlock, read is under socket -+ * semaphore for sends and unprotected in poll -+ */ -+ unsigned long poll_reserv; -+ unsigned long ub_waitspc; /* space waiting for */ -+ unsigned long ub_wcharged; -+ struct list_head ub_sock_list; -+ struct user_beancounter *ub; -+}; -+ -+#define sock_bc(__sk) (&(__sk)->sk_bc) -+#define skb_bc(__skb) (&(__skb)->skb_bc) -+#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc)) -+#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL) -+ -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_stat.h linux-2.6.16-026test009/include/ub/ub_stat.h ---- linux-2.6.16.orig/include/ub/ub_stat.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_stat.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,70 @@ -+/* -+ * include/ub/ub_stat.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_STAT_H_ -+#define __UB_STAT_H_ -+ -+/* sys_ubstat commands list */ -+#define UBSTAT_READ_ONE 0x010000 -+#define UBSTAT_READ_ALL 0x020000 -+#define UBSTAT_READ_FULL 0x030000 -+#define UBSTAT_UBLIST 0x040000 -+#define UBSTAT_UBPARMNUM 0x050000 -+#define UBSTAT_GETTIME 0x060000 -+ -+#define UBSTAT_CMD(func) ((func) & 0xF0000) -+#define UBSTAT_PARMID(func) ((func) & 0x0FFFF) -+ -+#define TIME_MAX_SEC (LONG_MAX / HZ) -+#define TIME_MAX_JIF (TIME_MAX_SEC * HZ) -+ -+typedef unsigned long ubstattime_t; -+ -+typedef struct { -+ ubstattime_t start_time; -+ ubstattime_t end_time; -+ ubstattime_t cur_time; -+} ubgettime_t; -+ -+typedef struct { -+ long maxinterval; -+ int signum; -+} ubnotifrq_t; -+ -+typedef struct { -+ unsigned long maxheld; -+ unsigned long failcnt; -+} ubstatparm_t; -+ -+typedef struct { -+ unsigned long barrier; -+ unsigned long limit; -+ unsigned long held; -+ unsigned long maxheld; -+ unsigned long minheld; -+ unsigned long failcnt; -+ unsigned long __unused1; -+ unsigned long __unused2; -+} ubstatparmf_t; -+ -+typedef struct { -+ ubstattime_t start_time; -+ ubstattime_t end_time; -+ ubstatparmf_t param[0]; -+} ubstatfull_t; -+ -+#ifdef __KERNEL__ -+struct ub_stat_notify { -+ struct list_head list; -+ struct task_struct *task; -+ int signum; -+}; -+#endif -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_task.h linux-2.6.16-026test009/include/ub/ub_task.h ---- linux-2.6.16.orig/include/ub/ub_task.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_task.h 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,49 @@ -+/* -+ * include/ub/ub_task.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_TASK_H_ -+#define __UB_TASK_H_ -+ -+#include <linux/config.h> -+ -+struct user_beancounter; -+ -+ -+#ifdef CONFIG_USER_RESOURCE -+ -+struct task_beancounter { -+ struct user_beancounter *exec_ub; -+ struct user_beancounter *task_ub; -+ struct user_beancounter *fork_sub; -+ void *task_fnode, *task_freserv; -+ unsigned long oom_generation; -+ unsigned long task_data[4]; -+}; -+ -+#define get_exec_ub() (current->task_bc.exec_ub) -+#define get_task_ub(__task) ((__task)->task_bc.task_ub) -+#define set_exec_ub(__newub) \ -+({ \ -+ struct user_beancounter *old; \ -+ struct task_beancounter *tbc; \ -+ tbc = ¤t->task_bc; \ -+ old = tbc->exec_ub; \ -+ tbc->exec_ub = __newub; \ -+ old; \ -+}) -+ -+#else /* CONFIG_USER_RESOURCE */ -+ -+#define get_exec_ub() (NULL) -+#define get_task_ub(task) (NULL) -+#define set_exec_ub(__ub) (NULL) -+ -+#endif /* CONFIG_USER_RESOURCE */ -+#endif /* __UB_TASK_H_ */ -diff -upr linux-2.6.16.orig/include/ub/ub_tcp.h linux-2.6.16-026test009/include/ub/ub_tcp.h ---- linux-2.6.16.orig/include/ub/ub_tcp.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_tcp.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,79 @@ -+/* -+ * include/ub/ub_tcp.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#ifndef __UB_TCP_H_ -+#define __UB_TCP_H_ -+ -+/* -+ * UB_NUMXXXSOCK, UB_XXXBUF accounting -+ */ -+ -+#include <ub/ub_sk.h> -+#include <ub/beancounter.h> -+ -+static inline void ub_tcp_update_maxadvmss(struct sock *sk) -+{ -+#ifdef CONFIG_USER_RESOURCE -+ if (!sock_has_ubc(sk)) -+ return; -+ if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss) -+ return; -+ -+ sock_bc(sk)->ub->ub_maxadvmss = -+ skb_charge_size(MAX_HEADER + sizeof(struct iphdr) -+ + sizeof(struct tcphdr) + tcp_sk(sk)->advmss); -+#endif -+} -+ -+static inline int ub_tcp_rmem_allows_expand(struct sock *sk) -+{ -+ if (tcp_memory_pressure) -+ return 0; -+#ifdef CONFIG_USER_RESOURCE -+ if (sock_has_ubc(sk)) { -+ struct user_beancounter *ub; -+ -+ ub = sock_bc(sk)->ub; -+ if (ub->ub_rmem_pressure == UB_RMEM_EXPAND) -+ return 1; -+ if (ub->ub_rmem_pressure == UB_RMEM_SHRINK) -+ return 0; -+ return sk->sk_rcvbuf <= ub->ub_rmem_thres; -+ } -+#endif -+ return 1; -+} -+ -+static inline int ub_tcp_memory_pressure(struct sock *sk) -+{ -+ if (tcp_memory_pressure) -+ return 1; -+#ifdef CONFIG_USER_RESOURCE -+ if (sock_has_ubc(sk)) -+ return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND; -+#endif -+ return 0; -+} -+ -+static inline int ub_tcp_shrink_rcvbuf(struct sock *sk) -+{ -+ if (tcp_memory_pressure) -+ return 1; -+#ifdef CONFIG_USER_RESOURCE -+ if (sock_has_ubc(sk)) -+ return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK; -+#endif -+ return 0; -+} -+ -+UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk)) -+UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk)) -+ -+#endif -diff -upr linux-2.6.16.orig/include/ub/ub_vmpages.h linux-2.6.16-026test009/include/ub/ub_vmpages.h ---- linux-2.6.16.orig/include/ub/ub_vmpages.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/include/ub/ub_vmpages.h 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,167 @@ -+/* -+ * include/ub/ub_vmpages.h -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. 
-+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#ifndef __UB_PAGES_H_ -+#define __UB_PAGES_H_ -+ -+#include <linux/linkage.h> -+#include <linux/config.h> -+#include <ub/beancounter.h> -+#include <ub/ub_decl.h> -+ -+/* -+ * Check whether vma has private or copy-on-write mapping. -+ * Should match checks in ub_protected_charge(). -+ */ -+#define VM_UB_PRIVATE(__flags, __file) \ -+ ( ((__flags) & VM_WRITE) ? \ -+ (__file) == NULL || !((__flags) & VM_SHARED) : \ -+ 0 \ -+ ) -+ -+/* Mprotect charging result */ -+#define PRIVVM_ERROR -1 -+#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */ -+#define PRIVVM_TO_PRIVATE 1 -+#define PRIVVM_TO_SHARED 2 -+ -+UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm, -+ unsigned long size, -+ unsigned long newflags, -+ struct vm_area_struct *vma)) -+ -+UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm, -+ struct vm_area_struct *vma, -+ unsigned long num)) -+#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1) -+UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm, -+ struct vm_area_struct *vma, -+ unsigned long num)) -+#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1) -+ -+UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm, -+ long sz)) -+ -+UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm, -+ unsigned long size, -+ unsigned vm_flags, -+ struct file *vm_file, -+ int strict)) -+UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm, -+ unsigned long size, -+ unsigned vm_flags, -+ struct file *vm_file)) -+ -+struct shmem_inode_info; -+UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i, -+ unsigned long sz)) -+UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i, -+ unsigned long sz)) -+UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi)) -+UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi, -+ unsigned long size)) 
-+#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1) -+ -+#ifdef CONFIG_USER_RESOURCE -+#define shmi_ub_set(shi, ub) do { \ -+ (shi)->shmi_ub = get_beancounter(ub); \ -+ } while (0) -+#define shmi_ub_put(shi) do { \ -+ put_beancounter((shi)->shmi_ub); \ -+ (shi)->shmi_ub = NULL; \ -+ } while (0) -+#else -+#define shmi_ub_set(shi, ub) do { } while (0) -+#define shmi_ub_put(shi) do { } while (0) -+#endif -+ -+UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm, -+ unsigned long size)) -+UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm, -+ unsigned long size)) -+UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi, -+ unsigned long size)) -+UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi, -+ unsigned long size)) -+ -+UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma, -+ unsigned long addr, unsigned long end)) -+UB_DECLARE_VOID_FUNC(warn_bad_rss(struct vm_area_struct *vma, -+ unsigned long freed)) -+#define pages_in_vma(vma) (pages_in_vma_range(vma, \ -+ vma->vm_start, vma->vm_end)) -+ -+#define UB_PAGE_WEIGHT_SHIFT 24 -+#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) -+ -+struct page_beancounter; -+#define PBC_COPY_SAME ((struct page_beancounter *) 1) -+ -+/* Mprotect charging result */ -+#define PRIVVM_ERROR -1 -+#define PRIVVM_NO_CHARGE 0 -+#define PRIVVM_TO_PRIVATE 1 -+#define PRIVVM_TO_SHARED 2 -+ -+extern void fastcall __ub_update_physpages(struct user_beancounter *ub); -+extern void fastcall __ub_update_oomguarpages(struct user_beancounter *ub); -+extern void fastcall __ub_update_privvm(struct user_beancounter *ub); -+ -+#ifdef CONFIG_USER_RSS_ACCOUNTING -+#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) -+#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) -+#else -+#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} -+#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { } -+#endif -+ 
-+PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc)) -+PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num)) -+PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc)) -+PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page, -+ struct mm_struct *mm, -+ struct page_beancounter **pbc)) -+PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page, -+ struct mm_struct *mm, -+ struct page_beancounter **pbc)) -+PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb)) -+PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb)) -+PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page, -+ struct mm_struct *mm)) -+ -+PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page)) -+#endif -+ -+#ifdef CONFIG_USER_SWAP_ACCOUNTING -+#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) -+#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) -+#else -+#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} -+#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { } -+#endif -+ -+struct swap_info_struct; -+SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n)) -+SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si)) -+SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n, -+ struct user_beancounter *ub)) -+SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n)) -+ -+#ifdef CONFIG_USER_RESOURCE -+#define ub_unmap_inc(mm) do { \ -+ (mm)->mm_ub->ub_stat[smp_processor_id()].unmap++; \ -+ } while (0) -+#define ub_swapin_inc(mm) do { \ -+ (mm)->mm_ub->ub_stat[smp_processor_id()].swapin++; \ -+ } while (0) -+#else -+#define ub_unmap_inc(mm) do { } while (0) -+#define ub_swapin_inc(mm) do { } while (0) -+#endif -diff -upr linux-2.6.16.orig/init/calibrate.c linux-2.6.16-026test009/init/calibrate.c ---- linux-2.6.16.orig/init/calibrate.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/init/calibrate.c 
2006-04-19 15:02:12.000000000 +0400 -@@ -7,6 +7,7 @@ - #include <linux/sched.h> - #include <linux/delay.h> - #include <linux/init.h> -+#include <linux/module.h> - - #include <asm/timex.h> - -@@ -105,6 +106,60 @@ static unsigned long __devinit calibrate - static unsigned long __devinit calibrate_delay_direct(void) {return 0;} - #endif - -+unsigned long cycles_per_jiffy, cycles_per_clock; -+ -+static __devinit void calibrate_cycles(void) -+{ -+ unsigned long ticks; -+ cycles_t time; -+ -+ ticks = jiffies; -+ while (ticks == jiffies) -+ /* nothing */; -+ time = get_cycles(); -+ ticks = jiffies; -+ while (ticks == jiffies) -+ /* nothing */; -+ -+ time = get_cycles() - time; -+ cycles_per_jiffy = time; -+ if ((time >> 32) != 0) { -+ printk("CPU too fast! timings are incorrect\n"); -+ cycles_per_jiffy = -1; -+ } -+} -+ -+EXPORT_SYMBOL(cycles_per_jiffy); -+EXPORT_SYMBOL(cycles_per_clock); -+ -+static __devinit void calc_cycles_per_jiffy(void) -+{ -+#if defined(__i386__) -+ extern unsigned long fast_gettimeoffset_quotient; -+ unsigned long low, high; -+ -+ if (fast_gettimeoffset_quotient != 0) { -+ __asm__("divl %2" -+ :"=a" (low), "=d" (high) -+ :"r" (fast_gettimeoffset_quotient), -+ "0" (0), "1" (1000000/HZ)); -+ -+ cycles_per_jiffy = low; -+ } -+#endif -+ if (cycles_per_jiffy == 0) -+ calibrate_cycles(); -+ -+ if (cycles_per_jiffy == 0) { -+ printk(KERN_WARNING "Cycles are stuck! " -+ "Some VPS statistics will not be available."); -+ /* to prevent division by zero in cycles_to_(clocks|jiffies) */ -+ cycles_per_jiffy = 1; -+ cycles_per_clock = 1; -+ } else -+ cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC); -+} -+ - /* - * This is the number of bits of precision for the loops_per_jiffy. Each - * bit takes on average 1.5/HZ seconds. 
This (like the original) is a little -@@ -170,4 +225,5 @@ void __devinit calibrate_delay(void) - loops_per_jiffy); - } - -+ calc_cycles_per_jiffy(); - } -diff -upr linux-2.6.16.orig/init/main.c linux-2.6.16-026test009/init/main.c ---- linux-2.6.16.orig/init/main.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/init/main.c 2006-04-19 15:02:12.000000000 +0400 -@@ -48,6 +48,8 @@ - #include <linux/mempolicy.h> - #include <linux/key.h> - -+#include <ub/beancounter.h> -+ - #include <asm/io.h> - #include <asm/bugs.h> - #include <asm/setup.h> -@@ -104,6 +106,20 @@ extern void tc_init(void); - enum system_states system_state; - EXPORT_SYMBOL(system_state); - -+#ifdef CONFIG_VE -+extern void init_ve_system(void); -+extern void prepare_ve0_process(struct task_struct *tsk); -+extern void prepare_ve0_proc_root(void); -+extern void prepare_ve0_sysctl(void); -+extern void prepare_ve0_loopback(void); -+#else -+#define init_ve_system() do { } while (0) -+#define prepare_ve0_process(tsk) do { } while (0) -+#define prepare_ve0_proc_root() do { } while (0) -+#define prepare_ve0_sysctl() do { } while (0) -+#define prepare_ve0_loopback() do { } while (0) -+#endif -+ - /* - * Boot command-line arguments - */ -@@ -447,6 +463,10 @@ asmlinkage void __init start_kernel(void - * enable them - */ - lock_kernel(); -+ /* -+ * Prepare ub0 to account early allocations if any -+ */ -+ ub_init_ub0(); - page_address_init(); - printk(KERN_NOTICE); - printk(linux_banner); -@@ -459,6 +479,8 @@ asmlinkage void __init start_kernel(void - */ - smp_prepare_boot_cpu(); - -+ prepare_ve0_process(&init_task); -+ - /* - * Set up the scheduler prior starting any interrupts (such as the - * timer interrupt). 
Full topology setup happens at smp_init() -@@ -524,6 +546,7 @@ asmlinkage void __init start_kernel(void - #endif - fork_init(num_physpages); - proc_caches_init(); -+ ub_init_cache(num_physpages); - buffer_init(); - unnamed_dev_init(); - key_init(); -@@ -534,7 +557,10 @@ asmlinkage void __init start_kernel(void - /* rootfs populating might need page-writeback */ - page_writeback_init(); - #ifdef CONFIG_PROC_FS -+ prepare_ve0_proc_root(); -+ prepare_ve0_sysctl(); - proc_root_init(); -+ ub_init_proc(); - #endif - cpuset_init(); - -@@ -542,6 +568,10 @@ asmlinkage void __init start_kernel(void - - acpi_early_init(); /* before LAPIC and SMP init */ - -+#ifdef CONFIG_USER_RESOURCE -+ ub_init_pbc(); -+#endif -+ - /* Do the rest non-__init'ed, we're now alive */ - rest_init(); - } -@@ -603,6 +633,9 @@ static void __init do_initcalls(void) - */ - static void __init do_basic_setup(void) - { -+ prepare_ve0_loopback(); -+ init_ve_system(); -+ - /* drivers will send hotplug events */ - init_workqueues(); - usermodehelper_init(); -diff -upr linux-2.6.16.orig/init/version.c linux-2.6.16-026test009/init/version.c ---- linux-2.6.16.orig/init/version.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/init/version.c 2006-04-19 15:02:12.000000000 +0400 -@@ -28,6 +28,12 @@ struct new_utsname system_utsname = { - - EXPORT_SYMBOL(system_utsname); - -+struct new_utsname virt_utsname = { -+ /* we need only this field */ -+ .release = UTS_RELEASE, -+}; -+EXPORT_SYMBOL(virt_utsname); -+ - const char linux_banner[] = - "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; -diff -upr linux-2.6.16.orig/ipc/mqueue.c linux-2.6.16-026test009/ipc/mqueue.c ---- linux-2.6.16.orig/ipc/mqueue.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/ipc/mqueue.c 2006-04-19 15:02:11.000000000 +0400 -@@ -639,7 +639,8 @@ static int oflag2acc[O_ACCMODE] = { MAY_ - return ERR_PTR(-EINVAL); - } - -- if 
(permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL)) { -+ if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], -+ NULL, NULL)) { - dput(dentry); - mntput(mqueue_mnt); - return ERR_PTR(-EACCES); -diff -upr linux-2.6.16.orig/ipc/msg.c linux-2.6.16-026test009/ipc/msg.c ---- linux-2.6.16.orig/ipc/msg.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/ipc/msg.c 2006-04-19 15:02:12.000000000 +0400 -@@ -88,6 +88,45 @@ void __init msg_init (void) - sysvipc_msg_proc_show); - } - -+#ifdef CONFIG_VE -+void __init prepare_msg(void) -+{ -+ get_ve0()->_msg_ids = &msg_ids; -+ get_ve0()->_msg_ctlmax = msg_ctlmax; -+ get_ve0()->_msg_ctlmnb = msg_ctlmnb; -+ get_ve0()->_msg_ctlmni = msg_ctlmni; -+} -+ -+#define msg_ids (*(get_exec_env()->_msg_ids)) -+#define msg_ctlmax (get_exec_env()->_msg_ctlmax) -+#define msg_ctlmnb (get_exec_env()->_msg_ctlmnb) -+#define msg_ctlmni (get_exec_env()->_msg_ctlmni) -+ -+void init_ve_ipc_msg(void) -+{ -+ msg_ctlmax = MSGMAX; -+ msg_ctlmnb = MSGMNB; -+ msg_ctlmni = MSGMNI; -+ ipc_init_ids(&msg_ids, MSGMNI); -+} -+ -+void cleanup_ve_ipc_msg(void) -+{ -+ int i; -+ struct msg_queue *msq; -+ -+ down(&msg_ids.sem); -+ for (i = 0; i <= msg_ids.max_id; i++) { -+ msq = msg_lock(i); -+ if (msq == NULL) -+ continue; -+ -+ freeque(msq, i); -+ } -+ up(&msg_ids.sem); -+} -+#endif -+ - static int newque (key_t key, int msgflg) - { - int id; -@@ -108,7 +147,7 @@ static int newque (key_t key, int msgflg - return retval; - } - -- id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni); -+ id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni, -1); - if(id == -1) { - security_msg_queue_free(msq); - ipc_rcu_putref(msq); -@@ -450,7 +489,7 @@ asmlinkage long sys_msgctl (int msqid, i - ipcp = &msq->q_perm; - err = -EPERM; - if (current->euid != ipcp->cuid && -- current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) -+ current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) - /* We _could_ check for CAP_CHOWN above, but we don't */ - goto 
out_unlock_up; - -@@ -540,7 +579,7 @@ static inline int pipelined_send(struct - msr->r_msg = ERR_PTR(-E2BIG); - } else { - msr->r_msg = NULL; -- msq->q_lrpid = msr->r_tsk->pid; -+ msq->q_lrpid = virt_pid(msr->r_tsk); - msq->q_rtime = get_seconds(); - wake_up_process(msr->r_tsk); - smp_mb(); -@@ -622,7 +661,7 @@ asmlinkage long sys_msgsnd (int msqid, s - } - } - -- msq->q_lspid = current->tgid; -+ msq->q_lspid = virt_tgid(current); - msq->q_stime = get_seconds(); - - if(!pipelined_send(msq,msg)) { -@@ -718,7 +757,7 @@ asmlinkage long sys_msgrcv (int msqid, s - list_del(&msg->m_list); - msq->q_qnum--; - msq->q_rtime = get_seconds(); -- msq->q_lrpid = current->tgid; -+ msq->q_lrpid = virt_tgid(current); - msq->q_cbytes -= msg->m_ts; - atomic_sub(msg->m_ts,&msg_bytes); - atomic_dec(&msg_hdrs); -@@ -833,3 +872,27 @@ static int sysvipc_msg_proc_show(struct - msq->q_ctime); - } - #endif -+ -+#ifdef CONFIG_VZ_CHECKPOINT_MODULE -+#include <linux/module.h> -+ -+int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg) -+{ -+ int i; -+ int err = 0; -+ struct msg_queue * msq; -+ -+ down(&msg_ids.sem); -+ for(i = 0; i <= msg_ids.max_id; i++) { -+ if ((msq = msg_lock(i)) == NULL) -+ continue; -+ err = func(msg_buildid(i,msq->q_perm.seq), msq, arg); -+ msg_unlock(msq); -+ if (err) -+ break; -+ } -+ up(&msg_ids.sem); -+ return err; -+} -+EXPORT_SYMBOL_GPL(sysvipc_walk_msg); -+#endif -diff -upr linux-2.6.16.orig/ipc/msgutil.c linux-2.6.16-026test009/ipc/msgutil.c ---- linux-2.6.16.orig/ipc/msgutil.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/ipc/msgutil.c 2006-04-19 15:02:11.000000000 +0400 -@@ -17,6 +17,8 @@ - - #include "util.h" - -+#include <ub/ub_mem.h> -+ - struct msg_msgseg { - struct msg_msgseg* next; - /* the next part of the message follows immediately */ -@@ -36,7 +38,7 @@ struct msg_msg *load_msg(const void __us - if (alen > DATALEN_MSG) - alen = DATALEN_MSG; - -- msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL); 
-+ msg = (struct msg_msg *)ub_kmalloc(sizeof(*msg) + alen, GFP_KERNEL); - if (msg == NULL) - return ERR_PTR(-ENOMEM); - -@@ -56,7 +58,7 @@ struct msg_msg *load_msg(const void __us - alen = len; - if (alen > DATALEN_SEG) - alen = DATALEN_SEG; -- seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen, -+ seg = (struct msg_msgseg *)ub_kmalloc(sizeof(*seg) + alen, - GFP_KERNEL); - if (seg == NULL) { - err = -ENOMEM; -diff -upr linux-2.6.16.orig/ipc/sem.c linux-2.6.16-026test009/ipc/sem.c ---- linux-2.6.16.orig/ipc/sem.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/ipc/sem.c 2006-04-19 15:02:12.000000000 +0400 -@@ -78,6 +78,7 @@ - #include <asm/uaccess.h> - #include "util.h" - -+#include <ub/ub_mem.h> - - #define sem_lock(id) ((struct sem_array*)ipc_lock(&sem_ids,id)) - #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) -@@ -88,7 +89,7 @@ - ipc_buildid(&sem_ids, id, seq) - static struct ipc_ids sem_ids; - --static int newary (key_t, int, int); -+static int newary (key_t, int, int, int); - static void freeary (struct sem_array *sma, int id); - #ifdef CONFIG_PROC_FS - static int sysvipc_sem_proc_show(struct seq_file *s, void *it); -@@ -124,6 +125,48 @@ void __init sem_init (void) - sysvipc_sem_proc_show); - } - -+#ifdef CONFIG_VE -+void __init prepare_sem(void) -+{ -+ get_ve0()->_sem_ids = &sem_ids; -+ get_ve0()->_used_sems = used_sems; -+ get_ve0()->_sem_ctls[0] = sem_ctls[0]; -+ get_ve0()->_sem_ctls[1] = sem_ctls[1]; -+ get_ve0()->_sem_ctls[2] = sem_ctls[2]; -+ get_ve0()->_sem_ctls[3] = sem_ctls[3]; -+} -+ -+#define sem_ids (*(get_exec_env()->_sem_ids)) -+#define used_sems (get_exec_env()->_used_sems) -+#define sem_ctls (get_exec_env()->_sem_ctls) -+ -+void init_ve_ipc_sem(void) -+{ -+ used_sems = 0; -+ sem_ctls[0] = SEMMSL; -+ sem_ctls[1] = SEMMNS; -+ sem_ctls[2] = SEMOPM; -+ sem_ctls[3] = SEMMNI; -+ ipc_init_ids(&sem_ids, SEMMNI); -+} -+ -+void cleanup_ve_ipc_sem(void) -+{ -+ int i; -+ struct sem_array *sma; -+ -+ down(&sem_ids.sem); -+ for (i = 
0; i <= sem_ids.max_id; i++) { -+ sma = sem_lock(i); -+ if (sma == NULL) -+ continue; -+ -+ freeary(sma, i); -+ } -+ up(&sem_ids.sem); -+} -+#endif -+ - /* - * Lockless wakeup algorithm: - * Without the check/retry algorithm a lockless wakeup is possible: -@@ -158,7 +201,7 @@ void __init sem_init (void) - */ - #define IN_WAKEUP 1 - --static int newary (key_t key, int nsems, int semflg) -+static int newary (key_t key, int semid, int nsems, int semflg) - { - int id; - int retval; -@@ -187,7 +230,7 @@ static int newary (key_t key, int nsems, - return retval; - } - -- id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni); -+ id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni, semid); - if(id == -1) { - security_sem_free(sma); - ipc_rcu_putref(sma); -@@ -217,12 +260,12 @@ asmlinkage long sys_semget (key_t key, i - down(&sem_ids.sem); - - if (key == IPC_PRIVATE) { -- err = newary(key, nsems, semflg); -+ err = newary(key, -1, nsems, semflg); - } else if ((id = ipc_findkey(&sem_ids, key)) == -1) { /* key not used */ - if (!(semflg & IPC_CREAT)) - err = -ENOENT; - else -- err = newary(key, nsems, semflg); -+ err = newary(key, -1, nsems, semflg); - } else if (semflg & IPC_CREAT && semflg & IPC_EXCL) { - err = -EEXIST; - } else { -@@ -743,7 +786,7 @@ static int semctl_main(int semid, int se - for (un = sma->undo; un; un = un->id_next) - un->semadj[semnum] = 0; - curr->semval = val; -- curr->sempid = current->tgid; -+ curr->sempid = virt_tgid(current); - sma->sem_ctime = get_seconds(); - /* maybe some queued-up processes were waiting for this */ - update_queue(sma); -@@ -823,7 +866,7 @@ static int semctl_down(int semid, int se - ipcp = &sma->sem_perm; - - if (current->euid != ipcp->cuid && -- current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) { -+ current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) { - err=-EPERM; - goto out_unlock; - } -@@ -944,7 +987,8 @@ static inline int get_undo_list(struct s - undo_list = current->sysvsem.undo_list; - if (!undo_list) { - size = 
sizeof(struct sem_undo_list); -- undo_list = (struct sem_undo_list *) kmalloc(size, GFP_KERNEL); -+ undo_list = (struct sem_undo_list *) ub_kmalloc(size, -+ GFP_KERNEL); - if (undo_list == NULL) - return -ENOMEM; - memset(undo_list, 0, size); -@@ -1008,7 +1052,8 @@ static struct sem_undo *find_undo(int se - ipc_rcu_getref(sma); - sem_unlock(sma); - -- new = (struct sem_undo *) kmalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); -+ new = (struct sem_undo *) ub_kmalloc(sizeof(struct sem_undo) + -+ sizeof(short)*nsems, GFP_KERNEL); - if (!new) { - ipc_lock_by_ptr(&sma->sem_perm); - ipc_rcu_putref(sma); -@@ -1066,7 +1111,7 @@ asmlinkage long sys_semtimedop(int semid - if (nsops > sc_semopm) - return -E2BIG; - if(nsops > SEMOPM_FAST) { -- sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); -+ sops = ub_kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); - if(sops==NULL) - return -ENOMEM; - } -@@ -1150,7 +1195,7 @@ retry_undos: - queue.sops = sops; - queue.nsops = nsops; - queue.undo = un; -- queue.pid = current->tgid; -+ queue.pid = virt_tgid(current); - queue.id = semid; - queue.alter = alter; - if (alter) -@@ -1320,7 +1365,7 @@ found: - sem->semval = 0; - if (sem->semval > SEMVMX) - sem->semval = SEMVMX; -- sem->sempid = current->tgid; -+ sem->sempid = virt_tgid(current); - } - } - sma->sem_otime = get_seconds(); -@@ -1351,3 +1396,48 @@ static int sysvipc_sem_proc_show(struct - sma->sem_ctime); - } - #endif -+ -+#ifdef CONFIG_VZ_CHECKPOINT_MODULE -+#include <linux/module.h> -+ -+int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg) -+{ -+ int err = 0; -+ struct sem_array *sma; -+ -+ down(&sem_ids.sem); -+ sma = sem_lock(semid); -+ if (!sma) { -+ err = newary(key, semid, size, semflg); -+ if (err >= 0) -+ sma = sem_lock(semid); -+ } -+ if (sma) -+ sem_unlock(sma); -+ up(&sem_ids.sem); -+ -+ return err > 0 ? 
0 : err; -+} -+EXPORT_SYMBOL_GPL(sysvipc_setup_sem); -+ -+int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg) -+{ -+ int i; -+ int err = 0; -+ struct sem_array *sma; -+ -+ down(&sem_ids.sem); -+ for (i = 0; i <= sem_ids.max_id; i++) { -+ if ((sma = sem_lock(i)) == NULL) -+ continue; -+ err = func(sem_buildid(i,sma->sem_perm.seq), sma, arg); -+ sem_unlock(sma); -+ if (err) -+ break; -+ } -+ up(&sem_ids.sem); -+ return err; -+} -+EXPORT_SYMBOL_GPL(sysvipc_walk_sem); -+EXPORT_SYMBOL_GPL(exit_sem); -+#endif -diff -upr linux-2.6.16.orig/ipc/shm.c linux-2.6.16-026test009/ipc/shm.c ---- linux-2.6.16.orig/ipc/shm.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/ipc/shm.c 2006-04-19 15:02:12.000000000 +0400 -@@ -30,9 +30,13 @@ - #include <linux/capability.h> - #include <linux/ptrace.h> - #include <linux/seq_file.h> -+#include <linux/shmem_fs.h> - - #include <asm/uaccess.h> - -+#include <ub/beancounter.h> -+#include <ub/ub_vmpages.h> -+ - #include "util.h" - - static struct file_operations shm_file_operations; -@@ -46,9 +50,11 @@ static struct ipc_ids shm_ids; - #define shm_buildid(id, seq) \ - ipc_buildid(&shm_ids, id, seq) - --static int newseg (key_t key, int shmflg, size_t size); -+static int newseg (key_t key, int shmid, int shmflg, size_t size); - static void shm_open (struct vm_area_struct *shmd); - static void shm_close (struct vm_area_struct *shmd); -+static void shm_destroy (struct shmid_kernel *shmd); -+static void do_shm_rmid(struct shmid_kernel *shp); - #ifdef CONFIG_PROC_FS - static int sysvipc_shm_proc_show(struct seq_file *s, void *it); - #endif -@@ -68,6 +74,68 @@ void __init shm_init (void) - sysvipc_shm_proc_show); - } - -+#ifdef CONFIG_VE -+void __init prepare_shm(void) -+{ -+ get_ve0()->_shm_ids = &shm_ids; -+ get_ve0()->_shm_ctlmax = shm_ctlmax; -+ get_ve0()->_shm_ctlall = shm_ctlall; -+ get_ve0()->_shm_ctlmni = shm_ctlmni; -+ get_ve0()->_shm_tot = shm_tot; -+} -+ -+#define shm_ids 
(*(get_exec_env()->_shm_ids)) -+#define shm_ctlmax (get_exec_env()->_shm_ctlmax) -+#define shm_ctlall (get_exec_env()->_shm_ctlall) -+#define shm_ctlmni (get_exec_env()->_shm_ctlmni) -+#define shm_total (get_exec_env()->_shm_tot) -+ -+void init_ve_ipc_shm(void) -+{ -+ shm_ctlmax = SHMMAX; -+ shm_ctlall = SHMALL; -+ shm_ctlmni = SHMMNI; -+ shm_total = 0; -+ ipc_init_ids(&shm_ids, 1); -+} -+ -+void cleanup_ve_ipc_shm(void) -+{ -+ int i; -+ struct shmid_kernel *shp; -+ -+ down(&shm_ids.sem); -+ for (i = 0; i <= shm_ids.max_id; i++) { -+ shp = shm_lock(i); -+ if (shp == NULL) -+ continue; -+ -+ do_shm_rmid(shp); -+ } -+ up(&shm_ids.sem); -+} -+#define sb_ve(sb) VE_OWNER_FSTYPE(sb->s_type) -+#define shm_total_sb(sb) (&sb_ve(sb)->_shm_tot) -+#define shm_lock_sb(id, sb) ((struct shmid_kernel *) \ -+ ipc_lock(sb_ve(sb)->_shm_ids, id)) -+#else -+/* renamed since there is a struct field named shm_tot */ -+#define shm_total shm_tot -+#define shm_total_sb(sb) (&shm_tot) -+#define shm_lock_sb(id, sb) shm_lock(id) -+#endif -+ -+static void do_shm_rmid(struct shmid_kernel *shp) -+{ -+ if (shp->shm_nattch){ -+ shp->shm_perm.mode |= SHM_DEST; -+ /* Do not find it any more */ -+ shp->shm_perm.key = IPC_PRIVATE; -+ shm_unlock(shp); -+ } else -+ shm_destroy (shp); -+} -+ - static inline int shm_checkid(struct shmid_kernel *s, int id) - { - if (ipc_checkid(&shm_ids,&s->shm_perm,id)) -@@ -75,25 +143,25 @@ static inline int shm_checkid(struct shm - return 0; - } - --static inline struct shmid_kernel *shm_rmid(int id) -+static inline struct shmid_kernel *shm_rmid(struct ipc_ids *ids, int id) - { -- return (struct shmid_kernel *)ipc_rmid(&shm_ids,id); -+ return (struct shmid_kernel *)ipc_rmid(ids,id); - } - --static inline int shm_addid(struct shmid_kernel *shp) -+static inline int shm_addid(struct shmid_kernel *shp, int reqid) - { -- return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni); -+ return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni, reqid); - } - - - --static inline void 
shm_inc (int id) { -+static inline void shm_inc(int id, struct super_block *sb) { - struct shmid_kernel *shp; - -- if(!(shp = shm_lock(id))) -+ if(!(shp = shm_lock_sb(id, sb))) - BUG(); - shp->shm_atim = get_seconds(); -- shp->shm_lprid = current->tgid; -+ shp->shm_lprid = virt_tgid(current); - shp->shm_nattch++; - shm_unlock(shp); - } -@@ -101,7 +169,50 @@ static inline void shm_inc (int id) { - /* This is called by fork, once for every shm attach. */ - static void shm_open (struct vm_area_struct *shmd) - { -- shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino); -+ shm_inc(shmd->vm_file->f_dentry->d_inode->i_ino, -+ shmd->vm_file->f_dentry->d_inode->i_sb); -+} -+ -+static int shmem_lock(struct shmid_kernel *shp, int lock, -+ struct user_struct *user) -+{ -+ struct file *file = shp->shm_file; -+ struct inode *inode = file->f_dentry->d_inode; -+ struct shmem_inode_info *info = SHMEM_I(inode); -+ unsigned long size; -+ -+ size = shp->shm_segsz + PAGE_SIZE - 1; -+ -+#ifdef CONFIG_SHMEM -+ spin_lock(&info->lock); -+ if (lock && !(info->flags & VM_LOCKED)) { -+ if (ub_lockedshm_charge(info, size) < 0) -+ goto out_ch; -+ -+ if (!user_shm_lock(inode->i_size, user)) -+ goto out_user; -+ info->flags |= VM_LOCKED; -+ } -+ if (!lock && (info->flags & VM_LOCKED) && user) { -+ ub_lockedshm_uncharge(info, size); -+ user_shm_unlock(inode->i_size, user); -+ info->flags &= ~VM_LOCKED; -+ } -+ spin_unlock(&info->lock); -+ return 0; -+ -+out_user: -+ ub_lockedshm_uncharge(info, size); -+out_ch: -+ spin_unlock(&info->lock); -+ return -ENOMEM; -+#else -+ if (lock && ub_lockedshm_charge(info, size)) -+ return -ENOMEM; -+ if (!lock) -+ ub_lockedshm_uncharge(info, size); -+ return 0; -+#endif - } - - /* -@@ -114,15 +225,24 @@ static void shm_open (struct vm_area_str - */ - static void shm_destroy (struct shmid_kernel *shp) - { -- shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; -- shm_rmid (shp->id); -+ int numpages, *shm_totalp; -+ struct file *f; -+ struct super_block *sb; -+ -+ 
f = shp->shm_file; -+ sb = f->f_dentry->d_inode->i_sb; -+ numpages = (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; -+ shm_totalp = shm_total_sb(sb); -+ *shm_totalp -= numpages; -+ -+ shm_rmid (shp->_shm_ids, shp->id); - shm_unlock(shp); - if (!is_file_hugepages(shp->shm_file)) -- shmem_lock(shp->shm_file, 0, shp->mlock_user); -+ shmem_lock(shp, 0, shp->mlock_user); - else - user_shm_unlock(shp->shm_file->f_dentry->d_inode->i_size, - shp->mlock_user); -- fput (shp->shm_file); -+ fput(f); - security_shm_free(shp); - ipc_rcu_putref(shp); - } -@@ -138,12 +258,24 @@ static void shm_close (struct vm_area_st - struct file * file = shmd->vm_file; - int id = file->f_dentry->d_inode->i_ino; - struct shmid_kernel *shp; -+ struct super_block *sb; -+ struct ipc_ids *ids; -+#ifdef CONFIG_VE -+ struct ve_struct *ve; -+ -+ sb = file->f_dentry->d_inode->i_sb; -+ ve = get_ve(sb_ve(sb)); -+ ids = ve->_shm_ids; -+#else -+ sb = file->f_dentry->d_inode->i_sb; -+ ids = &shm_ids; -+#endif - -- down (&shm_ids.sem); -+ down (&ids->sem); - /* remove from the list of attaches of the shm segment */ -- if(!(shp = shm_lock(id))) -+ if(!(shp = shm_lock_sb(id, sb))) - BUG(); -- shp->shm_lprid = current->tgid; -+ shp->shm_lprid = virt_tgid(current); - shp->shm_dtim = get_seconds(); - shp->shm_nattch--; - if(shp->shm_nattch == 0 && -@@ -151,7 +283,10 @@ static void shm_close (struct vm_area_st - shm_destroy (shp); - else - shm_unlock(shp); -- up (&shm_ids.sem); -+ up(&ids->sem); -+#ifdef CONFIG_VE -+ put_ve(ve); -+#endif - } - - static int shm_mmap(struct file * file, struct vm_area_struct * vma) -@@ -161,7 +296,10 @@ static int shm_mmap(struct file * file, - ret = shmem_mmap(file, vma); - if (ret == 0) { - vma->vm_ops = &shm_vm_ops; -- shm_inc(file->f_dentry->d_inode->i_ino); -+ if (!(vma->vm_flags & VM_WRITE)) -+ vma->vm_flags &= ~VM_MAYWRITE; -+ shm_inc(file->f_dentry->d_inode->i_ino, -+ file->f_dentry->d_inode->i_sb); - } - - return ret; -@@ -184,19 +322,19 @@ static struct 
vm_operations_struct shm_v - #endif - }; - --static int newseg (key_t key, int shmflg, size_t size) -+static int newseg (key_t key, int shmid, int shmflg, size_t size) - { - int error; - struct shmid_kernel *shp; - int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; - struct file * file; -- char name[13]; -+ char name[26]; - int id; - - if (size < SHMMIN || size > shm_ctlmax) - return -EINVAL; - -- if (shm_tot + numpages >= shm_ctlall) -+ if (shm_total + numpages >= shm_ctlall) - return -ENOSPC; - - shp = ipc_rcu_alloc(sizeof(*shp)); -@@ -227,7 +365,11 @@ static int newseg (key_t key, int shmflg - if ((shmflg & SHM_NORESERVE) && - sysctl_overcommit_memory != OVERCOMMIT_NEVER) - acctflag = 0; -+#ifdef CONFIG_VE -+ sprintf (name, "VE%d.SYSV%08x", get_exec_env()->veid, key); -+#else - sprintf (name, "SYSV%08x", key); -+#endif - file = shmem_file_setup(name, size, acctflag); - } - error = PTR_ERR(file); -@@ -235,17 +377,18 @@ static int newseg (key_t key, int shmflg - goto no_file; - - error = -ENOSPC; -- id = shm_addid(shp); -+ id = shm_addid(shp, shmid); - if(id == -1) - goto no_id; - -- shp->shm_cprid = current->tgid; -+ shp->shm_cprid = virt_tgid(current); - shp->shm_lprid = 0; - shp->shm_atim = shp->shm_dtim = 0; - shp->shm_ctim = get_seconds(); - shp->shm_segsz = size; - shp->shm_nattch = 0; - shp->id = shm_buildid(id,shp->shm_perm.seq); -+ shp->_shm_ids = &shm_ids; - shp->shm_file = file; - file->f_dentry->d_inode->i_ino = shp->id; - -@@ -253,7 +396,7 @@ static int newseg (key_t key, int shmflg - if (!(shmflg & SHM_HUGETLB)) - file->f_op = &shm_file_operations; - -- shm_tot += numpages; -+ shm_total += numpages; - shm_unlock(shp); - return shp->id; - -@@ -272,12 +415,12 @@ asmlinkage long sys_shmget (key_t key, s - - down(&shm_ids.sem); - if (key == IPC_PRIVATE) { -- err = newseg(key, shmflg, size); -+ err = newseg(key, -1, shmflg, size); - } else if ((id = ipc_findkey(&shm_ids, key)) == -1) { - if (!(shmflg & IPC_CREAT)) - err = -ENOENT; - else -- err = 
newseg(key, shmflg, size); -+ err = newseg(key, -1, shmflg, size); - } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) { - err = -EEXIST; - } else { -@@ -470,7 +613,7 @@ asmlinkage long sys_shmctl (int shmid, i - down(&shm_ids.sem); - shm_info.used_ids = shm_ids.in_use; - shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp); -- shm_info.shm_tot = shm_tot; -+ shm_info.shm_tot = shm_total; - shm_info.swap_attempts = 0; - shm_info.swap_successes = 0; - err = shm_ids.max_id; -@@ -557,14 +700,14 @@ asmlinkage long sys_shmctl (int shmid, i - if(cmd==SHM_LOCK) { - struct user_struct * user = current->user; - if (!is_file_hugepages(shp->shm_file)) { -- err = shmem_lock(shp->shm_file, 1, user); -+ err = shmem_lock(shp, 1, user); - if (!err) { - shp->shm_perm.mode |= SHM_LOCKED; - shp->mlock_user = user; - } - } - } else if (!is_file_hugepages(shp->shm_file)) { -- shmem_lock(shp->shm_file, 0, shp->mlock_user); -+ shmem_lock(shp, 0, shp->mlock_user); - shp->shm_perm.mode &= ~SHM_LOCKED; - shp->mlock_user = NULL; - } -@@ -594,7 +737,7 @@ asmlinkage long sys_shmctl (int shmid, i - - if (current->euid != shp->shm_perm.uid && - current->euid != shp->shm_perm.cuid && -- !capable(CAP_SYS_ADMIN)) { -+ !capable(CAP_VE_SYS_ADMIN)) { - err=-EPERM; - goto out_unlock_up; - } -@@ -603,13 +746,7 @@ asmlinkage long sys_shmctl (int shmid, i - if (err) - goto out_unlock_up; - -- if (shp->shm_nattch){ -- shp->shm_perm.mode |= SHM_DEST; -- /* Do not find it any more */ -- shp->shm_perm.key = IPC_PRIVATE; -- shm_unlock(shp); -- } else -- shm_destroy (shp); -+ do_shm_rmid(shp); - up(&shm_ids.sem); - goto out; - } -@@ -633,7 +770,7 @@ asmlinkage long sys_shmctl (int shmid, i - err=-EPERM; - if (current->euid != shp->shm_perm.uid && - current->euid != shp->shm_perm.cuid && -- !capable(CAP_SYS_ADMIN)) { -+ !capable(CAP_VE_SYS_ADMIN)) { - goto out_unlock_up; - } - -@@ -916,3 +1053,55 @@ static int sysvipc_shm_proc_show(struct - shp->shm_ctim); - } - #endif -+ -+#ifdef CONFIG_VZ_CHECKPOINT_MODULE 
-+#include <linux/module.h> -+ -+struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg) -+{ -+ struct shmid_kernel *shp; -+ struct file *file; -+ -+ down(&shm_ids.sem); -+ shp = shm_lock(shmid); -+ if (!shp) { -+ int err; -+ -+ err = newseg(key, shmid, shmflg, size); -+ file = ERR_PTR(err); -+ if (err < 0) -+ goto out; -+ shp = shm_lock(shmid); -+ } -+ file = ERR_PTR(-EINVAL); -+ if (shp) { -+ file = shp->shm_file; -+ get_file(file); -+ shm_unlock(shp); -+ } -+out: -+ up(&shm_ids.sem); -+ return file; -+} -+EXPORT_SYMBOL_GPL(sysvipc_setup_shm); -+ -+int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg) -+{ -+ int i; -+ int err = 0; -+ struct shmid_kernel* shp; -+ -+ down(&shm_ids.sem); -+ for(i = 0; i <= shm_ids.max_id; i++) { -+ if ((shp = shm_lock(i)) == NULL) -+ continue; -+ err = func(shp, arg); -+ shm_unlock(shp); -+ if (err) -+ break; -+ } -+ up(&shm_ids.sem); -+ return err; -+} -+EXPORT_SYMBOL_GPL(sysvipc_walk_shm); -+#endif -diff -upr linux-2.6.16.orig/ipc/util.c linux-2.6.16-026test009/ipc/util.c ---- linux-2.6.16.orig/ipc/util.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/ipc/util.c 2006-04-19 15:02:13.000000000 +0400 -@@ -13,6 +13,7 @@ - */ - - #include <linux/config.h> -+#include <linux/module.h> - #include <linux/mm.h> - #include <linux/shm.h> - #include <linux/init.h> -@@ -30,6 +31,8 @@ - - #include <asm/unistd.h> - -+#include <ub/ub_mem.h> -+ - #include "util.h" - - struct ipc_proc_iface { -@@ -65,7 +68,7 @@ __initcall(ipc_init); - * array itself. 
- */ - --void __init ipc_init_ids(struct ipc_ids* ids, int size) -+void __ve_init ipc_init_ids(struct ipc_ids* ids, int size) - { - int i; - sema_init(&ids->sem,1); -@@ -94,7 +97,21 @@ void __init ipc_init_ids(struct ipc_ids* - ids->entries->size = size; - for(i=0;i<size;i++) - ids->entries->p[i] = NULL; -+ -+ ids->owner_env = get_exec_env(); -+} -+ -+#ifdef CONFIG_VE -+static inline void ipc_free_ids(struct ipc_ids *ids) -+{ -+ if (ids == NULL) -+ return; -+ -+ if (ids->entries != &ids->nullentry) -+ ipc_rcu_putref(ids->entries); -+ kfree(ids); - } -+#endif - - #ifdef CONFIG_PROC_FS - static struct file_operations sysvipc_proc_fops; -@@ -182,8 +199,7 @@ static int grow_ary(struct ipc_ids* ids, - if(new == NULL) - return size; - new->size = newsize; -- memcpy(new->p, ids->entries->p, sizeof(struct kern_ipc_perm *)*size + -- sizeof(struct ipc_id_ary)); -+ memcpy(new->p, ids->entries->p, sizeof(struct kern_ipc_perm *)*size); - for(i=size;i<newsize;i++) { - new->p[i] = NULL; - } -@@ -213,10 +229,20 @@ static int grow_ary(struct ipc_ids* ids, - * Called with ipc_ids.sem held. 
- */ - --int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) -+int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid) - { - int id; - -+ if (reqid >= 0) { -+ id = reqid%SEQ_MULTIPLIER; -+ size = grow_ary(ids,id+1); -+ if (id >= size) -+ return -1; -+ if (ids->entries->p[id] == NULL) -+ goto found; -+ return -1; -+ } -+ - size = grow_ary(ids,size); - - /* -@@ -229,16 +255,21 @@ int ipc_addid(struct ipc_ids* ids, struc - } - return -1; - found: -- ids->in_use++; -+ if (ids->in_use++ == 0) -+ (void)get_ve(ids->owner_env); - if (id > ids->max_id) - ids->max_id = id; - - new->cuid = new->uid = current->euid; - new->gid = new->cgid = current->egid; - -- new->seq = ids->seq++; -- if(ids->seq > ids->seq_max) -- ids->seq = 0; -+ if (reqid >= 0) { -+ new->seq = reqid/SEQ_MULTIPLIER; -+ } else { -+ new->seq = ids->seq++; -+ if(ids->seq > ids->seq_max) -+ ids->seq = 0; -+ } - - spin_lock_init(&new->lock); - new->deleted = 0; -@@ -276,7 +307,8 @@ struct kern_ipc_perm* ipc_rmid(struct ip - ids->entries->p[lid] = NULL; - if(p==NULL) - BUG(); -- ids->in_use--; -+ if (--ids->in_use == 0) -+ put_ve(ids->owner_env); - - if (lid == ids->max_id) { - do { -@@ -302,9 +334,9 @@ void* ipc_alloc(int size) - { - void* out; - if(size > PAGE_SIZE) -- out = vmalloc(size); -+ out = ub_vmalloc(size); - else -- out = kmalloc(size, GFP_KERNEL); -+ out = ub_kmalloc(size, GFP_KERNEL); - return out; - } - -@@ -387,14 +419,14 @@ void* ipc_rcu_alloc(int size) - * workqueue if necessary (for vmalloc). 
- */ - if (rcu_use_vmalloc(size)) { -- out = vmalloc(HDRLEN_VMALLOC + size); -+ out = ub_vmalloc(HDRLEN_VMALLOC + size); - if (out) { - out += HDRLEN_VMALLOC; - container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; - container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; - } - } else { -- out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); -+ out = ub_kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); - if (out) { - out += HDRLEN_KMALLOC; - container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; -@@ -603,6 +635,71 @@ int ipc_checkid(struct ipc_ids* ids, str - return 0; - } - -+#ifdef CONFIG_VE -+void __init prepare_ipc(void) -+{ -+ prepare_msg(); -+ prepare_sem(); -+ prepare_shm(); -+} -+ -+int init_ve_ipc(struct ve_struct * envid) -+{ -+ envid->_msg_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), -+ GFP_KERNEL); -+ if (envid->_msg_ids == NULL) -+ goto out_nomem; -+ envid->_sem_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), -+ GFP_KERNEL); -+ if (envid->_sem_ids == NULL) -+ goto out_free_msg; -+ envid->_shm_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), -+ GFP_KERNEL); -+ if (envid->_shm_ids == NULL) -+ goto out_free_sem; -+ -+ init_ve_ipc_msg(); -+ init_ve_ipc_sem(); -+ init_ve_ipc_shm(); -+ return 0; -+ -+out_free_sem: -+ kfree(envid->_sem_ids); -+out_free_msg: -+ kfree(envid->_msg_ids); -+out_nomem: -+ return -ENOMEM; -+} -+ -+void ve_ipc_cleanup(void) -+{ -+ cleanup_ve_ipc_msg(); -+ cleanup_ve_ipc_sem(); -+ cleanup_ve_ipc_shm(); -+} -+ -+void ve_ipc_free(struct ve_struct *env) -+{ -+ ipc_free_ids(env->_msg_ids); -+ ipc_free_ids(env->_sem_ids); -+ ipc_free_ids(env->_shm_ids); -+ env->_msg_ids = NULL; -+ env->_sem_ids = NULL; -+ env->_shm_ids = NULL; -+} -+ -+void fini_ve_ipc(struct ve_struct *ptr) -+{ -+ ve_ipc_cleanup(); -+ ve_ipc_free(ptr); -+} -+ -+EXPORT_SYMBOL(init_ve_ipc); -+EXPORT_SYMBOL(ve_ipc_cleanup); -+EXPORT_SYMBOL(ve_ipc_free); -+EXPORT_SYMBOL(fini_ve_ipc); -+#endif /* CONFIG_VE */ -+ - #ifdef 
__ARCH_WANT_IPC_PARSE_VERSION - - -diff -upr linux-2.6.16.orig/ipc/util.h linux-2.6.16-026test009/ipc/util.h ---- linux-2.6.16.orig/ipc/util.h 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/ipc/util.h 2006-04-19 15:02:12.000000000 +0400 -@@ -15,6 +15,22 @@ void sem_init (void); - void msg_init (void); - void shm_init (void); - -+#ifdef CONFIG_VE -+void prepare_msg(void); -+void prepare_sem(void); -+void prepare_shm(void); -+void init_ve_ipc_msg(void); -+void init_ve_ipc_sem(void); -+void init_ve_ipc_shm(void); -+void cleanup_ve_ipc_msg(void); -+void cleanup_ve_ipc_sem(void); -+void cleanup_ve_ipc_shm(void); -+ -+#define __ve_init -+#else -+#define __ve_init __init -+#endif -+ - struct ipc_id_ary { - int size; - struct kern_ipc_perm *p[0]; -@@ -28,10 +44,11 @@ struct ipc_ids { - struct semaphore sem; - struct ipc_id_ary nullentry; - struct ipc_id_ary* entries; -+ struct ve_struct *owner_env; - }; - - struct seq_file; --void __init ipc_init_ids(struct ipc_ids* ids, int size); -+void __ve_init ipc_init_ids(struct ipc_ids *ids, int size); - #ifdef CONFIG_PROC_FS - void __init ipc_init_proc_interface(const char *path, const char *header, - struct ipc_ids *ids, -@@ -42,7 +59,7 @@ void __init ipc_init_proc_interface(cons - - /* must be called with ids->sem acquired.*/ - int ipc_findkey(struct ipc_ids* ids, key_t key); --int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size); -+int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid); - - /* must be called with both locks acquired. */ - struct kern_ipc_perm* ipc_rmid(struct ipc_ids* ids, int id); -diff -upr linux-2.6.16.orig/kernel/Kconfig.openvz linux-2.6.16-026test009/kernel/Kconfig.openvz ---- linux-2.6.16.orig/kernel/Kconfig.openvz 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/Kconfig.openvz 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,54 @@ -+# Copyright (C) 2005 SWsoft -+# All rights reserved. 
-+# Licensing governed by "linux/COPYING.SWsoft" file. -+ -+menu "OpenVZ" -+ -+config VE -+ bool "Virtual Environment support" -+ default y -+ help -+ This option adds support of virtual Linux running on the original box -+ with fully supported virtual network driver, tty subsystem and -+ configurable access for hardware and other resources. -+ -+config VE_CALLS -+ tristate "VE calls interface" -+ depends on VE -+ default m -+ help -+ This option controls how to build vzmon code containing VE calls. -+ By default it's build in module vzmon.o -+ -+config VE_NETDEV -+ tristate "VE networking" -+ depends on VE -+ default m -+ help -+ This option controls whether to build VE networking code. -+ -+config VE_IPTABLES -+ bool "VE netfiltering" -+ depends on VE && VE_NETDEV && INET && NETFILTER -+ default y -+ help -+ This option controls whether to build VE netfiltering code. -+ -+config VZ_WDOG -+ tristate "VE watchdog module" -+ depends on VE -+ default m -+ help -+ This option controls building of vzwdog module, which dumps -+ a lot of useful system info on console periodically. -+ -+config VZ_CHECKPOINT -+ tristate "Checkpointing & restoring Virtual Environments" -+ depends on SOFTWARE_SUSPEND -+ default m -+ help -+ This option adds two modules, "cpt" and "rst", which allow -+ to save a running Virtual Environment and restore it -+ on another host (live migration) or on the same host (checkpointing). 
-+ -+endmenu -diff -upr linux-2.6.16.orig/kernel/Makefile linux-2.6.16-026test009/kernel/Makefile ---- linux-2.6.16.orig/kernel/Makefile 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/Makefile 2006-04-19 15:02:12.000000000 +0400 -@@ -10,6 +10,18 @@ obj-y = sched.o fork.o exec_domain.o - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o - -+obj-y += ub/ -+ -+obj-$(CONFIG_VE) += ve.o -+obj-$(CONFIG_VE) += veowner.o -+obj-$(CONFIG_VE_CALLS) += vzdev.o -+obj-$(CONFIG_VZ_WDOG) += vzwdog.o -+obj-$(CONFIG_VE_CALLS) += vzmon.o -+ -+vzmon-objs = vecalls.o -+ -+obj-$(CONFIG_VZ_CHECKPOINT) += cpt/ -+ - obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o - obj-$(CONFIG_FUTEX) += futex.o - obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -diff -upr linux-2.6.16.orig/kernel/audit.c linux-2.6.16-026test009/kernel/audit.c ---- linux-2.6.16.orig/kernel/audit.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/audit.c 2006-04-19 15:02:12.000000000 +0400 -@@ -372,6 +372,9 @@ static int audit_receive_msg(struct sk_b - uid_t loginuid; /* loginuid of sender */ - struct audit_sig_info sig_data; - -+ if (!ve_is_super(VE_OWNER_SKB(skb))) -+ return -ECONNREFUSED; -+ - err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); - if (err) - return err; -diff -upr linux-2.6.16.orig/kernel/capability.c linux-2.6.16-026test009/kernel/capability.c ---- linux-2.6.16.orig/kernel/capability.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/capability.c 2006-04-19 15:02:12.000000000 +0400 -@@ -25,6 +25,7 @@ EXPORT_SYMBOL(cap_bset); - * Locking rule: acquire this prior to tasklist_lock. 
- */ - static DEFINE_SPINLOCK(task_capability_lock); -+EXPORT_SYMBOL(task_capability_lock); - - /* - * For sys_getproccap() and sys_setproccap(), any of the three -@@ -67,8 +68,8 @@ asmlinkage long sys_capget(cap_user_head - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - -- if (pid && pid != current->pid) { -- target = find_task_by_pid(pid); -+ if (pid && pid != virt_pid(current)) { -+ target = find_task_by_pid_ve(pid); - if (!target) { - ret = -ESRCH; - goto out; -@@ -100,9 +101,13 @@ static inline int cap_set_pg(int pgrp, k - int ret = -EPERM; - int found = 0; - -- do_each_task_pid(pgrp, PIDTYPE_PGID, g) { -+ pgrp = vpid_to_pid(pgrp); -+ if (pgrp < 0) -+ return ret; -+ -+ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, g) { - target = g; -- while_each_thread(g, target) { -+ while_each_thread_ve(g, target) { - if (!security_capset_check(target, effective, - inheritable, - permitted)) { -@@ -113,7 +118,7 @@ static inline int cap_set_pg(int pgrp, k - } - found = 1; - } -- } while_each_task_pid(pgrp, PIDTYPE_PGID, g); -+ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, g); - - if (!found) - ret = 0; -@@ -132,7 +137,7 @@ static inline int cap_set_all(kernel_cap - int ret = -EPERM; - int found = 0; - -- do_each_thread(g, target) { -+ do_each_thread_ve(g, target) { - if (target == current || target->pid == 1) - continue; - found = 1; -@@ -141,7 +146,7 @@ static inline int cap_set_all(kernel_cap - continue; - ret = 0; - security_capset_set(target, effective, inheritable, permitted); -- } while_each_thread(g, target); -+ } while_each_thread_ve(g, target); - - if (!found) - ret = 0; -@@ -188,7 +193,7 @@ asmlinkage long sys_capset(cap_user_head - if (get_user(pid, &header->pid)) - return -EFAULT; - -- if (pid && pid != current->pid && !capable(CAP_SETPCAP)) -+ if (pid && pid != virt_pid(current) && !capable(CAP_SETPCAP)) - return -EPERM; - - if (copy_from_user(&effective, &data->effective, sizeof(effective)) || -@@ -199,8 +204,8 @@ asmlinkage long 
sys_capset(cap_user_head - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - -- if (pid > 0 && pid != current->pid) { -- target = find_task_by_pid(pid); -+ if (pid > 0 && pid != virt_pid(current)) { -+ target = find_task_by_pid_ve(pid); - if (!target) { - ret = -ESRCH; - goto out; -diff -upr linux-2.6.16.orig/kernel/compat.c linux-2.6.16-026test009/kernel/compat.c ---- linux-2.6.16.orig/kernel/compat.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/compat.c 2006-04-19 15:02:12.000000000 +0400 -@@ -21,6 +21,8 @@ - #include <linux/syscalls.h> - #include <linux/unistd.h> - #include <linux/security.h> -+#include <linux/hrtimer.h> -+#include <linux/module.h> - - #include <asm/uaccess.h> - -@@ -38,61 +40,73 @@ int put_compat_timespec(const struct tim - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; - } - --static long compat_nanosleep_restart(struct restart_block *restart) -+long compat_nanosleep_restart(struct restart_block *restart) - { -- unsigned long expire = restart->arg0, now = jiffies; - struct compat_timespec __user *rmtp; -+ struct timespec tu; -+ void *rfn_save = restart->fn; -+ struct hrtimer timer; -+ ktime_t rem; - -- /* Did it expire while we handled signals? 
*/ -- if (!time_after(expire, now)) -- return 0; -+ restart->fn = do_no_restart_syscall; -+ -+ hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); -+ -+ timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; - -- expire = schedule_timeout_interruptible(expire - now); -- if (expire == 0) -+ set_current_state(TASK_INTERRUPTIBLE); -+ rem = schedule_hrtimer(&timer, HRTIMER_ABS); -+ -+ if (rem.tv64 <= 0) - return 0; - -- rmtp = (struct compat_timespec __user *)restart->arg1; -- if (rmtp) { -- struct compat_timespec ct; -- struct timespec t; -- -- jiffies_to_timespec(expire, &t); -- ct.tv_sec = t.tv_sec; -- ct.tv_nsec = t.tv_nsec; -- if (copy_to_user(rmtp, &ct, sizeof(ct))) -- return -EFAULT; -- } -- /* The 'restart' block is already filled in */ -+ rmtp = (struct compat_timespec __user *) restart->arg2; -+ tu = ktime_to_timespec(rem); -+ if (rmtp && put_compat_timespec(&tu, rmtp)) -+ return -EFAULT; -+ -+ restart->fn = rfn_save; -+ -+ /* The other values in restart are already filled in */ - return -ERESTART_RESTARTBLOCK; - } -+EXPORT_SYMBOL_GPL(compat_nanosleep_restart); - - asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) - { - struct timespec t; - struct restart_block *restart; -- unsigned long expire; -+ struct hrtimer timer; -+ ktime_t rem; - - if (get_compat_timespec(&t, rqtp)) - return -EFAULT; - -- if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) -+ if (!timespec_valid(&t)) - return -EINVAL; - -- expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); -- expire = schedule_timeout_interruptible(expire); -- if (expire == 0) -+ hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL); -+ -+ timer.expires = timespec_to_ktime(t); -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ rem = schedule_hrtimer(&timer, HRTIMER_REL); -+ if (rem.tv64 <= 0) - return 0; - -- if (rmtp) { -- jiffies_to_timespec(expire, &t); -- if (put_compat_timespec(&t, rmtp)) -- return 
-EFAULT; -- } -+ t = ktime_to_timespec(rem); -+ -+ if (rmtp && put_compat_timespec(&t, rmtp)) -+ return -EFAULT; -+ - restart = &current_thread_info()->restart_block; - restart->fn = compat_nanosleep_restart; -- restart->arg0 = jiffies + expire; -- restart->arg1 = (unsigned long) rmtp; -+ restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; -+ restart->arg1 = timer.expires.tv64 >> 32; -+ restart->arg2 = (unsigned long) rmtp; -+ restart->arg3 = (unsigned long) timer.base->index; -+ - return -ERESTART_RESTARTBLOCK; - } - -diff -upr linux-2.6.16.orig/kernel/configs.c linux-2.6.16-026test009/kernel/configs.c ---- linux-2.6.16.orig/kernel/configs.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/configs.c 2006-04-19 15:02:12.000000000 +0400 -@@ -89,8 +89,7 @@ static int __init ikconfig_init(void) - struct proc_dir_entry *entry; - - /* create the current config file */ -- entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, -- &proc_root); -+ entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL); - if (!entry) - return -ENOMEM; - -diff -upr linux-2.6.16.orig/kernel/cpt/Makefile linux-2.6.16-026test009/kernel/cpt/Makefile ---- linux-2.6.16.orig/kernel/cpt/Makefile 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/Makefile 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,34 @@ -+# -+# -+# kernel/cpt/Makefile -+# -+# Copyright (C) 2000-2005 SWsoft -+# All rights reserved. -+# -+# Licensing governed by "linux/COPYING.SWsoft" file. 
-+ -+obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o -+ -+vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \ -+ cpt_mm.o cpt_files.o cpt_kernel.o \ -+ cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \ -+ cpt_conntrack.o cpt_ubc.o cpt_epoll.o -+ -+vzrst-objs := rst_proc.o rst_undump.o cpt_obj.o rst_context.o rst_process.o \ -+ rst_mm.o rst_files.o cpt_kernel.o \ -+ rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \ -+ rst_conntrack.o rst_ubc.o rst_epoll.o -+ -+ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y) -+vzcpt-objs += cpt_pagein.o -+vzrst-objs += rst_pagein.o -+endif -+ -+ifeq ($(CONFIG_X86_64), y) -+vzcpt-objs += cpt_x8664.o -+vzrst-objs += cpt_x8664.o rst_x8664.o -+endif -+ -+ifeq ($(CONFIG_X86_32), y) -+vzrst-objs += rst_i386.o -+endif -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_conntrack.c linux-2.6.16-026test009/kernel/cpt/cpt_conntrack.c ---- linux-2.6.16.orig/kernel/cpt/cpt_conntrack.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_conntrack.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,370 @@ -+/* -+ * -+ * kernel/cpt/cpt_conntrack.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/fs.h> -+#include <linux/socket.h> -+#include <linux/netdevice.h> -+#include <linux/inetdevice.h> -+#include <linux/rtnetlink.h> -+#include <linux/unistd.h> -+#include <linux/ve.h> -+#include <linux/vzcalluser.h> -+#include <linux/cpt_image.h> -+#include <linux/icmp.h> -+#include <linux/ip.h> -+ -+#if defined(CONFIG_VE_IPTABLES) && \ -+ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) -+ -+#include <linux/netfilter.h> -+#include <linux/netfilter_ipv4/ip_conntrack.h> -+#include <linux/netfilter_ipv4/ip_nat.h> -+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -+#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -+#include <linux/netfilter_ipv4/ip_conntrack_core.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+ -+/* How does it work? -+ * -+ * Network is disabled, so new conntrack entries will not appear. -+ * However, some of them can disappear because of timeouts. -+ * -+ * So, we take read_lock, collect all required information atomically, -+ * essentially, creating parallel "refcount" structures holding pointers. -+ * We delete conntrack timers as well, so the structures cannot disappear -+ * after releasing the lock. Now, after releasing lock we can dump everything -+ * safely. And on exit we restore timers to their original values. -+ * -+ * Note, this approach is not going to work in VE0. 
-+ */ -+ -+struct ct_holder -+{ -+ struct ct_holder *next; -+ struct ip_conntrack_tuple_hash *cth; -+ int index; -+}; -+ -+static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple) -+{ -+ v->cpt_dst = tuple->dst.ip; -+ v->cpt_dstport = tuple->dst.u.all; -+ v->cpt_protonum = tuple->dst.protonum; -+ v->cpt_dir = tuple->dst.dir; -+ -+ v->cpt_src = tuple->src.ip; -+ v->cpt_srcport = tuple->src.u.all; -+} -+ -+static int dump_one_expect(struct cpt_ip_connexpect_image *v, -+ struct ip_conntrack_expect *exp, -+ int sibling, cpt_context_t *ctx) -+{ -+ int err = 0; -+ -+ v->cpt_next = sizeof(*v); -+ v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_VOID; -+ -+ encode_tuple(&v->cpt_tuple, &exp->tuple); -+ encode_tuple(&v->cpt_mask, &exp->mask); -+ v->cpt_sibling_conntrack = sibling; -+ v->cpt_flags = exp->flags; -+ v->cpt_seq = exp->id; -+ v->cpt_dir = 0; -+ v->cpt_manip_proto = 0; -+#ifdef CONFIG_IP_NF_NAT_NEEDED -+ v->cpt_manip_proto = exp->saved_proto.all; -+ v->cpt_dir = exp->dir; -+#endif -+ v->cpt_timeout = 0; -+ if (exp->master->helper->timeout) -+ v->cpt_timeout = exp->timeout.expires - jiffies; -+ return err; -+} -+ -+/* NOTE. We use one page to dump list of expectations. This may be not enough -+ * in theory. In practice there is only one expectation per conntrack record. -+ * Moreover, taking into account that _ALL_ of expecations are saved in one -+ * global list, which is looked up each incoming/outpging packet, the system -+ * would be severely dead when even one conntrack would have so much of -+ * expectations. Shortly, I am not going to repair this. 
-+ */ -+ -+static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list, -+ cpt_context_t *ctx) -+{ -+ int err = 0; -+ unsigned long pg; -+ struct cpt_ip_connexpect_image *v; -+ struct ip_conntrack_expect *exp; -+ -+ if (ct->expecting == 0) -+ return err; -+ if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE) -+ return -ENOBUFS; -+ -+ pg = __get_free_page(GFP_KERNEL); -+ if (!pg) -+ return -ENOMEM; -+ v = (struct cpt_ip_connexpect_image *)pg; -+ -+ read_lock_bh(&ip_conntrack_lock); -+ list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) { -+ int sibling; -+ -+ if (exp->master != ct) -+ continue; -+ -+ if (ct->helper == NULL) { -+ eprintk_ctx("conntrack: no helper and non-trivial expectation\n"); -+ err = -EINVAL; -+ break; -+ } -+ -+ sibling = 0; -+#if 0 -+ /* That's all? No need to calculate sibling? */ -+ if (exp->sibling) { -+ struct ct_holder *c; -+ for (c = list; c; c = c->next) { -+ if (tuplehash_to_ctrack(c->cth) == exp->sibling) { -+ sibling = c->index; -+ break; -+ } -+ } -+ /* NOTE: exp->sibling could be not "confirmed" and, hence, -+ * out of hash table. We should just ignore such a sibling, -+ * the connection is going to be retried, the packet -+ * apparently was lost somewhere. -+ */ -+ if (sibling == 0) -+ dprintk_ctx("sibling conntrack is not found\n"); -+ } -+#endif -+ -+ /* If the expectation still does not have exp->sibling -+ * and timer is not running, it is about to die on another -+ * cpu. Skip it. 
*/ -+ if (!sibling && -+ ct->helper->timeout && -+ !timer_pending(&exp->timeout)) { -+ dprintk_ctx("conntrack: expectation: no timer\n"); -+ continue; -+ } -+ -+ err = dump_one_expect(v, exp, sibling, ctx); -+ if (err) -+ break; -+ -+ v++; -+ } -+ read_unlock_bh(&ip_conntrack_lock); -+ -+ if (err == 0 && (unsigned long)v != pg) -+ ctx->write((void*)pg, (unsigned long)v - pg, ctx); -+ -+ free_page(pg); -+ return err; -+} -+ -+static int dump_one_ct(struct ct_holder *c, struct ct_holder *list, -+ cpt_context_t *ctx) -+{ -+ struct ip_conntrack_tuple_hash *h = c->cth; -+ struct ip_conntrack *ct = tuplehash_to_ctrack(h); -+ struct cpt_ip_conntrack_image v; -+ int err = 0; -+ -+ if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) { -+ eprintk_ctx("conntrack module ct->proto version mismatch\n"); -+ return -EINVAL; -+ } -+ if (sizeof(v.cpt_help_data) != sizeof(ct->help)) { -+ eprintk_ctx("conntrack module ct->help version mismatch\n"); -+ return -EINVAL; -+ } -+ -+ cpt_open_object(NULL, ctx); -+ -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_NET_CONNTRACK; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_ARRAY; -+ -+ read_lock_bh(&ip_conntrack_lock); -+ v.cpt_status = ct->status; -+ v.cpt_timeout = ct->timeout.expires - jiffies; -+ v.cpt_ct_helper = (ct->helper != NULL); -+ v.cpt_index = c->index; -+ v.cpt_id = ct->id; -+ v.cpt_mark = 0; -+#if defined(CONFIG_IP_NF_CONNTRACK_MARK) -+ v.cpt_mark = ct->mark; -+#endif -+ encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple); -+ encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple); -+ memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data)); -+ memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data)); -+ -+ v.cpt_masq_index = 0; -+ v.cpt_initialized = 0; -+ v.cpt_num_manips = 0; -+ v.cpt_nat_helper = 0; -+#ifdef CONFIG_IP_NF_NAT_NEEDED -+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ -+ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) -+ v.cpt_masq_index = ct->nat.masq_index; -+#endif -+ /* "help" 
data is used by pptp, difficult to support */ -+ v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos; -+ v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before; -+ v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after; -+ v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos; -+ v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before; -+ v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after; -+#endif -+ read_unlock_bh(&ip_conntrack_lock); -+ -+ ctx->write(&v, sizeof(v), ctx); -+ -+ err = dump_expect_list(ct, list, ctx); -+ -+ cpt_close_object(ctx); -+ return err; -+} -+ -+int cpt_dump_ip_conntrack(cpt_context_t * ctx) -+{ -+ struct ct_holder *ct_list = NULL; -+ struct ct_holder *c, **cp; -+ int err = 0; -+ int index = 0; -+ int idx; -+ -+ if (get_exec_env()->_ip_conntrack == NULL) -+ return 0; -+ -+ for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) { -+ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); -+ if (c == NULL) { -+ err = -ENOMEM; -+ goto done; -+ } -+ memset(c, 0, sizeof(struct ct_holder)); -+ c->next = ct_list; -+ ct_list = c; -+ } -+ -+ c = ct_list; -+ -+ read_lock_bh(&ip_conntrack_lock); -+ for (idx = 0; idx < ip_conntrack_htable_size; idx++) { -+ struct ip_conntrack_tuple_hash *h; -+ list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) { -+ /* Skip reply tuples, they are covered by original -+ * direction. */ -+ if (DIRECTION(h)) -+ continue; -+ -+ /* Oops, we have not enough of holders... -+ * It is impossible. */ -+ if (unlikely(c == NULL)) { -+ read_unlock_bh(&ip_conntrack_lock); -+ eprintk_ctx("unexpected conntrack appeared\n"); -+ err = -ENOMEM; -+ goto done; -+ } -+ -+ /* If timer is not running, it means that it -+ * has just been scheduled on another cpu. -+ * We should skip this conntrack, it is about to be -+ * destroyed. 
*/ -+ if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) { -+ dprintk_ctx("conntrack: no timer\n"); -+ continue; -+ } -+ -+ /* Timer is deleted. refcnt is _not_ decreased. -+ * We are going to restore the timer on exit -+ * from this function. */ -+ c->cth = h; -+ c->index = ++index; -+ c = c->next; -+ } -+ } -+ read_unlock_bh(&ip_conntrack_lock); -+ -+ /* No conntracks? Good. */ -+ if (index == 0) -+ goto done; -+ -+ /* Comb the list a little. */ -+ cp = &ct_list; -+ while ((c = *cp) != NULL) { -+ /* Discard unused entries; they can appear, if some -+ * entries were timed out since we preallocated the list. -+ */ -+ if (c->cth == NULL) { -+ *cp = c->next; -+ kfree(c); -+ continue; -+ } -+ -+ /* Move conntracks attached to expectations to the beginning -+ * of the list. */ -+ if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) { -+ *cp = c->next; -+ c->next = ct_list; -+ ct_list = c; -+ dprintk_ctx("conntrack: %d moved in list\n", c->index); -+ continue; -+ } -+ cp = &c->next; -+ } -+ -+ cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK); -+ -+ for (c = ct_list; c; c = c->next) { -+ err = dump_one_ct(c, ct_list, ctx); -+ if (err) -+ goto done; -+ } -+ -+ cpt_close_section(ctx); -+ -+done: -+ while ((c = ct_list) != NULL) { -+ ct_list = c->next; -+ if (c->cth) { -+ /* Restore timer. refcnt is preserved. */ -+ add_timer(&tuplehash_to_ctrack(c->cth)->timeout); -+ } -+ kfree(c); -+ } -+ return err; -+} -+ -+#endif -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_context.c linux-2.6.16-026test009/kernel/cpt/cpt_context.c ---- linux-2.6.16.orig/kernel/cpt/cpt_context.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_context.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,255 @@ -+/* -+ * -+ * kernel/cpt/cpt_context.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/pagemap.h> -+ -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+ -+static void file_write(const void *addr, size_t count, struct cpt_context *ctx) -+{ -+ mm_segment_t oldfs; -+ ssize_t err = -EBADF; -+ struct file *file = ctx->file; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ if (file) -+ err = file->f_op->write(file, addr, count, &file->f_pos); -+ set_fs(oldfs); -+ if (err != count && !ctx->write_error) -+ ctx->write_error = err < 0 ? err : -EIO; -+} -+ -+static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) -+{ -+ mm_segment_t oldfs; -+ ssize_t err = -EBADF; -+ struct file *file = ctx->file; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ if (file) -+ err = file->f_op->write(file, addr, count, &pos); -+ set_fs(oldfs); -+ if (err != count && !ctx->write_error) -+ ctx->write_error = err < 0 ? 
err : -EIO; -+} -+ -+static void file_align(struct cpt_context *ctx) -+{ -+ struct file *file = ctx->file; -+ -+ if (file) -+ file->f_pos = CPT_ALIGN(file->f_pos); -+} -+ -+void cpt_context_init(struct cpt_context *ctx) -+{ -+ int i; -+ -+ memset(ctx, 0, sizeof(*ctx)); -+ -+ init_MUTEX(&ctx->main_sem); -+ ctx->refcount = 1; -+ -+ ctx->current_section = -1; -+ ctx->current_object = -1; -+ ctx->pagesize = PAGE_SIZE; -+ ctx->write = file_write; -+ ctx->pwrite = file_pwrite; -+ ctx->align = file_align; -+ for (i=0; i < CPT_SECT_MAX; i++) -+ ctx->sections[i] = CPT_NULL; -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ init_completion(&ctx->pgin_notify); -+#endif -+ cpt_object_init(ctx); -+} -+ -+int cpt_open_dumpfile(struct cpt_context *ctx) -+{ -+ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); -+ if (ctx->tmpbuf == NULL) -+ return -ENOMEM; -+ __cpt_release_buf(ctx); -+ return 0; -+} -+ -+int cpt_close_dumpfile(struct cpt_context *ctx) -+{ -+ if (ctx->file) { -+ fput(ctx->file); -+ ctx->file = NULL; -+ } -+ if (ctx->tmpbuf) { -+ free_page((unsigned long)ctx->tmpbuf); -+ ctx->tmpbuf = NULL; -+ } -+ if (ctx->write_error) -+ eprintk_ctx("error while writing dump file: %d\n", ctx->write_error); -+ return ctx->write_error; -+} -+ -+int cpt_major_hdr_out(struct cpt_context *ctx) -+{ -+ struct cpt_major_hdr hdr; -+ -+ if (ctx->file == NULL) -+ return 0; -+ -+ memset(&hdr, 0, sizeof(hdr)); -+ hdr.cpt_signature[0] = CPT_SIGNATURE0; -+ hdr.cpt_signature[1] = CPT_SIGNATURE1; -+ hdr.cpt_signature[2] = CPT_SIGNATURE2; -+ hdr.cpt_signature[3] = CPT_SIGNATURE3; -+ hdr.cpt_hdrlen = sizeof(hdr); -+ hdr.cpt_image_version = 1; -+#ifdef CONFIG_X86_32 -+ hdr.cpt_os_arch = CPT_OS_ARCH_I386; -+#endif -+#ifdef CONFIG_X86_64 -+ hdr.cpt_os_arch = CPT_OS_ARCH_EMT64; -+#endif -+ hdr.cpt_os_version = 0; -+ hdr.cpt_os_features = 0; -+ hdr.cpt_pagesize = PAGE_SIZE; -+ hdr.cpt_hz = HZ; -+ hdr.cpt_start_jiffies64 = ctx->virt_jiffies64; -+ hdr.cpt_start_sec = ctx->start_time.tv_sec; -+ hdr.cpt_start_nsec = 
ctx->start_time.tv_nsec; -+ hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags; -+ hdr.cpt_kernel_config[0] = ctx->kernel_config_flags; -+ hdr.cpt_iptables_mask = ctx->iptables_mask; -+ -+ ctx->write(&hdr, sizeof(hdr), ctx); -+ return 0; -+} -+ -+int cpt_close_section(struct cpt_context *ctx) -+{ -+ if (ctx->file && ctx->current_section >= 0) { -+ __u64 next = ctx->file->f_pos - ctx->current_section; -+ ctx->pwrite(&next, 8, ctx, ctx->current_section); -+ ctx->current_section = -1; -+ } -+ return 0; -+} -+EXPORT_SYMBOL(cpt_close_section); -+ -+int cpt_open_section(struct cpt_context *ctx, __u32 type) -+{ -+ struct cpt_section_hdr hdr; -+ -+ if (ctx->file == NULL) -+ return 0; -+ -+ cpt_close_section(ctx); -+ -+ ctx->current_section = ctx->file->f_pos; -+ ctx->sections[type] = ctx->current_section; -+ -+ hdr.cpt_next = 0; -+ hdr.cpt_section = type; -+ hdr.cpt_hdrlen = sizeof(hdr); -+ hdr.cpt_align = 0; -+ ctx->write(&hdr, sizeof(hdr), ctx); -+ -+ return 0; -+} -+EXPORT_SYMBOL(cpt_open_section); -+ -+ -+int cpt_close_object(struct cpt_context *ctx) -+{ -+ if (ctx->file && ctx->current_object >= 0) { -+ __u64 next = ctx->file->f_pos - ctx->current_object; -+ ctx->pwrite(&next, 8, ctx, ctx->current_object); -+ ctx->current_object = -1; -+ } -+ return 0; -+} -+EXPORT_SYMBOL(cpt_close_object); -+ -+int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ if (ctx->file == NULL) -+ return 0; -+ -+ cpt_close_object(ctx); -+ -+ ctx->current_object = ctx->file->f_pos; -+ if (obj) -+ cpt_obj_setpos(obj, ctx->current_object, ctx); -+ -+ return 0; -+} -+EXPORT_SYMBOL(cpt_open_object); -+ -+int cpt_push_object(loff_t *saved, struct cpt_context *ctx) -+{ -+ if (ctx->file) { -+ *saved = ctx->current_object; -+ ctx->current_object = ctx->file->f_pos; -+ } -+ return 0; -+} -+EXPORT_SYMBOL(cpt_push_object); -+ -+int cpt_pop_object(loff_t *saved, struct cpt_context *ctx) -+{ -+ ctx->current_object = *saved; -+ return 0; -+} -+EXPORT_SYMBOL(cpt_pop_object); -+ -+int 
cpt_dump_tail(struct cpt_context *ctx) -+{ -+ struct cpt_major_tail hdr; -+ int i; -+ -+ if (ctx->file == NULL) -+ return 0; -+ -+ cpt_open_section(ctx, CPT_SECT_TRAILER); -+ memset(&hdr, 0, sizeof(hdr)); -+ hdr.cpt_next = sizeof(hdr); -+ hdr.cpt_object = CPT_OBJ_TRAILER; -+ hdr.cpt_hdrlen = sizeof(hdr); -+ hdr.cpt_content = CPT_CONTENT_VOID; -+ hdr.cpt_lazypages = 0; -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ hdr.cpt_lazypages = ctx->lazypages; -+#endif -+ hdr.cpt_64bit = ctx->tasks64; -+ hdr.cpt_signature[0] = CPT_SIGNATURE0; -+ hdr.cpt_signature[1] = CPT_SIGNATURE1; -+ hdr.cpt_signature[2] = CPT_SIGNATURE2; -+ hdr.cpt_signature[3] = CPT_SIGNATURE3; -+ hdr.cpt_nsect = CPT_SECT_MAX_INDEX; -+ for (i = 0; i < CPT_SECT_MAX_INDEX; i++) -+ hdr.cpt_sections[i] = ctx->sections[i]; -+ -+ ctx->write(&hdr, sizeof(hdr), ctx); -+ cpt_close_section(ctx); -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_context.h linux-2.6.16-026test009/kernel/cpt/cpt_context.h ---- linux-2.6.16.orig/kernel/cpt/cpt_context.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_context.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,196 @@ -+#include <linux/fs.h> -+#include <asm/uaccess.h> -+ -+#define CPT_CTX_ERROR -1 -+#define CPT_CTX_IDLE 0 -+#define CPT_CTX_SUSPENDING 1 -+#define CPT_CTX_SUSPENDED 2 -+#define CPT_CTX_DUMPING 3 -+#define CPT_CTX_UNDUMPING 4 -+#define CPT_CTX_UNDUMPED 5 -+ -+#define CPT_TID(tsk) (tsk)->pid, virt_pid(tsk), (tsk)->comm -+#define CPT_FID "%d,%d(%s)" -+ -+ -+typedef struct cpt_context -+{ -+ struct list_head ctx_list; -+ int refcount; -+ int ctx_state; -+ int objcount; -+ int sticky; -+ struct semaphore main_sem; -+ -+ struct file *errorfile; -+ struct file *statusfile; -+ struct file *lockfile; -+ -+ int errno; -+ char *error_msg; -+ loff_t err_offset; -+ -+ struct file *file; -+ char *tmpbuf; -+ int pagesize; -+ -+ loff_t current_section; -+ loff_t current_object; -+ -+ loff_t sections[CPT_SECT_MAX]; -+ -+ __u32 errormask; -+ 
__u32 write_error; -+ -+ struct list_head object_array[CPT_OBJ_MAX]; -+ -+ void (*write)(const void *addr, size_t count, struct cpt_context *ctx); -+ void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); -+ ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx); -+ ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); -+ void (*align)(struct cpt_context *ctx); -+ int ve_id; -+ int contextid; -+ __u64 cpt_jiffies64; /* Host jiffies64 at the moment of cpt/rst, -+ * corresponging to start_time */ -+ __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when -+ * VE did not migrate. */ -+ struct timespec start_time; -+ struct timespec delta_time; -+ int image_version; -+ int lo_index; -+ int lo_index_old; -+ int venet_index; -+ int venet_index_old; -+ __u64 iptables_mask; -+ -+#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9) -+#define CPT_ANONVMA_HSIZE (1<<CPT_ANONVMA_HBITS) -+ struct hlist_head *anonvmas; -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ struct file *pagein_file_in; -+ struct file *pagein_file_out; -+ int lazy_vm; -+ int lazypages; -+ int lazytype; -+ task_t *pgin_task; -+ unsigned long last_pagein; -+ struct pagein_desc **pgin_dir; -+ struct pgin_device *pagein_dev; -+ struct completion pgin_notify; -+ struct completion *pgind_completion; -+ struct swap_info_struct *pgin_swp; -+#endif -+ int tasks64; -+ __u32 src_cpu_flags; -+ __u32 dst_cpu_flags; -+ __u32 kernel_config_flags; -+ -+ struct filejob *filejob_queue; -+} cpt_context_t; -+ -+typedef struct { -+ int pid; -+ cpt_context_t *ctx; -+ struct completion done; -+} pagein_info_t; -+ -+int pagein_info_printf(char *buf, cpt_context_t *ctx); -+ -+int cpt_open_dumpfile(struct cpt_context *); -+int cpt_close_dumpfile(struct cpt_context *); -+int rst_open_dumpfile(struct cpt_context *); -+void rst_close_dumpfile(struct cpt_context *); -+void cpt_context_init(struct cpt_context *); -+void rst_context_init(struct cpt_context *); 
-+void cpt_context_destroy(struct cpt_context *); -+ -+void rst_report_error(int err, cpt_context_t *ctx); -+ -+ -+int cpt_major_hdr_out(struct cpt_context *ctx); -+int cpt_dump_tail(struct cpt_context *ctx); -+int cpt_close_section(struct cpt_context *ctx); -+int cpt_open_section(struct cpt_context *ctx, __u32 type); -+int cpt_close_object(struct cpt_context *ctx); -+int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx); -+int cpt_push_object(loff_t *saved, struct cpt_context *ctx); -+int cpt_pop_object(loff_t *saved, struct cpt_context *ctx); -+ -+int rst_get_section(int type, struct cpt_context * ctx, loff_t *, loff_t *); -+__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx); -+__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx); -+void rst_put_name(__u8 *name, struct cpt_context *ctx); -+int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx); -+void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx); -+ -+#define rst_get_object(type, pos, tmp, ctx) \ -+ _rst_get_object((type), (pos), (tmp), sizeof(*(tmp)), (ctx)) -+ -+extern int debug_level; -+ -+#define cpt_printk(lvl, fmt, args...) do { \ -+ if (lvl <= debug_level) \ -+ printk(fmt, ##args); \ -+ } while (0) -+ -+#define dprintk(a...) cpt_printk(3, "CPT DBG: " a) -+#define dprintk_ctx(f, arg...) dprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) -+ -+#define wprintk(a...) cpt_printk(2, "CPT WRN: " a) -+#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) -+ -+#define eprintk(a...) cpt_printk(1, "CPT ERR: " a) -+#define eprintk_ctx(f, arg...) 
\ -+do { \ -+ eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg); \ -+ if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \ -+ ctx->err_offset += snprintf((char*)(ctx->error_msg + \ -+ ctx->err_offset), \ -+ PAGE_SIZE - ctx->err_offset, f, ##arg); \ -+} while(0) -+ -+#define CPT_TMPBUF_FREE 0x789adf12 -+#define CPT_TMPBUF_BUSY 0xabcd9876 -+ -+static inline void *cpt_get_buf(cpt_context_t *ctx) -+{ -+ void *buf = ctx->tmpbuf; -+ -+ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE); -+ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY; -+ return buf; -+} -+ -+static inline void __cpt_release_buf(cpt_context_t *ctx) -+{ -+ void *buf = ctx->tmpbuf; -+ -+ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; -+} -+ -+static inline void cpt_release_buf(cpt_context_t *ctx) -+{ -+ void *buf = ctx->tmpbuf; -+ -+ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY); -+ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; -+} -+ -+static inline void cpt_flush_error(cpt_context_t *ctx) -+{ -+ mm_segment_t oldfs; -+ -+ if (ctx->errorfile && ctx->error_msg && ctx->err_offset) { -+ if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) { -+ oldfs = get_fs(); -+ set_fs(KERNEL_DS); -+ ctx->errorfile->f_op->write(ctx->errorfile, -+ ctx->error_msg, ctx->err_offset, -+ &ctx->errorfile->f_pos); -+ set_fs(oldfs); -+ } -+ ctx->error_msg[0] = 0; -+ ctx->err_offset = 0; -+ } -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_dump.c linux-2.6.16-026test009/kernel/cpt/cpt_dump.c ---- linux-2.6.16.orig/kernel/cpt/cpt_dump.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_dump.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,838 @@ -+/* -+ * -+ * kernel/cpt/cpt_dump.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/pagemap.h> -+#include <linux/ptrace.h> -+#include <linux/smp_lock.h> -+#include <linux/ve.h> -+#include <linux/ve_proto.h> -+#include <linux/virtinfo.h> -+#include <ub/ub_task.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_dump.h" -+#include "cpt_files.h" -+#include "cpt_mm.h" -+#include "cpt_process.h" -+#include "cpt_net.h" -+#include "cpt_socket.h" -+#include "cpt_ubc.h" -+#include "cpt_kernel.h" -+ -+ -+static int vps_child_level(task_t *root, task_t *c) -+{ -+ int level = 0; -+ int veid = VE_TASK_INFO(c)->owner_env->veid; -+ -+ while (VE_TASK_INFO(c)->owner_env->veid == veid) { -+ if (c->pid != c->tgid) -+ c = c->group_leader; -+ if (c == root) -+ return level; -+ -+ c = c->real_parent; -+ level++; -+ } -+ return -1; -+} -+ -+static inline int freezable(struct task_struct * p) -+{ -+ if (p->exit_state) -+ return 0; -+ -+ switch (p->state) { -+ case EXIT_ZOMBIE: -+ case EXIT_DEAD: -+ case TASK_STOPPED: -+#if TASK_TRACED != TASK_STOPPED -+ case TASK_TRACED: -+#endif -+ return 0; -+ default: -+ return 1; -+ } -+} -+ -+/* -+ * Some comment is necessary about PF_FREEZE,PF_FROZEN,TIF_FREEZE... -+ * -+ * SWSUSP uses PF_FREEZE flag in tsk->flags raising it in context -+ * of another process. Apparently, it is unacceptable on SMP. -+ * Let's take freeze_processes() in kernel/power/process.c as an example. 
-+ * Unserialized modifications tsk->flags easily -+ * (believe or not, but it happens with probability of almost 100% :-)) -+ * creates the situation when setting PF_FREEZE in freeze_processes(), -+ * which quickly spins raising PF_FREEZE of all the processes, -+ * _clears_ PF_FROZEN just set in refrigerator(), so that suspend deadlocks. -+ * -+ * So, to make things clean, we require that those flags may be modified -+ * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE -+ * is just a kind of signal. -+ * -+ * It is not enough, because we are still not allowed to change tsk->flags -+ * in context of another process, we can corrupt another flags, when the process -+ * running on another cpu modifies them. So, we use TIF_FREEZE in thread flags, -+ * which can be changed atomically. -+ * -+ * PF_FROZEN also changes in context of another process, but this happens -+ * only when the process is already in refrigerator() which does not modify -+ * tsk->flags. -+ */ -+ -+static int vps_stop_tasks(struct cpt_context *ctx) -+{ -+ unsigned long start_time = jiffies; -+ int err; -+ task_t *p, *g; -+ int todo; -+ int round = 0; -+ -+ do_gettimespec(&ctx->start_time); -+ ctx->cpt_jiffies64 = get_jiffies_64(); -+ ctx->virt_jiffies64 = ctx->cpt_jiffies64 + get_exec_env()->jiffies_fixup; -+ -+ read_lock(&tasklist_lock); -+ for(;;) { -+ task_t *root; -+ todo = 0; -+ -+ root = find_task_by_pid_ve(1); -+ if (!root) { -+ read_unlock(&tasklist_lock); -+ eprintk_ctx("cannot find ve init\n"); -+ return -ESRCH; -+ } -+ -+ do_each_thread_ve(g, p) { -+ if (vps_child_level(root, p) >= 0) { -+ if (!is_virtual_pid(virt_pid(p))) { -+ eprintk_ctx("external process %d/%d(%s) inside VPS (e.g. vzctl enter or vzctl exec).\n", virt_pid(p), p->pid, p->comm); -+ todo = -1; -+ goto out; -+ } -+ if (p->vfork_done) { -+ /* Task between vfork()...exec() -+ * cannot be frozen, because parent -+ * wait in uninterruptible state. 
-+ * So, we do nothing, waiting for -+ * exec(), unless: -+ */ -+ if (p->state == TASK_STOPPED || -+ p->state == TASK_TRACED) { -+ eprintk_ctx("task %d/%d(%s) is stopped while vfork(). Checkpointing is impossible.\n", virt_pid(p), p->pid, p->comm); -+ todo = -1; -+ /* It is fatal, _user_ stopped -+ * vfork()ing task, so that we -+ * cannot suspend now. -+ */ -+ } else { -+ todo = -3; -+ } -+ goto out; -+ } -+ if (p->state == TASK_TRACED -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) -+ && !p->stopped_state -+#endif -+ ) { -+ int ptrace_id = p->pn_state; -+ /* Debugger waits for signal. */ -+ switch (ptrace_id) { -+ case PN_STOP_TF: -+ case PN_STOP_TF_RT: -+ case PN_STOP_ENTRY: -+ case PN_STOP_FORK: -+ case PN_STOP_VFORK: -+ case PN_STOP_SIGNAL: -+ case PN_STOP_EXIT: -+ case PN_STOP_LEAVE: -+ break; -+ default: -+ eprintk_ctx("task %d/%d(%s) is stopped by debugger while %d.\n", virt_pid(p), p->pid, p->comm, ptrace_id); -+ todo = -1; -+ goto out; -+ } -+ } -+ if (p->flags & PF_NOFREEZE) -+ goto out; -+ if (p->flags & PF_FROZEN) -+ continue; -+ if (!freezable(p)) -+ continue; -+ -+ spin_lock_irq(&p->sighand->siglock); -+ set_tsk_thread_flag(p, TIF_FREEZE); -+ signal_wake_up(p, 0); -+ spin_unlock_irq(&p->sighand->siglock); -+ -+ if (round == 10) -+ wprintk_ctx("%d/%d(%s) is running\n", virt_pid(p), p->pid, p->comm); -+ -+ todo++; -+ } else { -+ if (p != current) { -+ eprintk_ctx("foreign process %d/%d(%s) inside VPS (e.g. 
vzctl enter or vzctl exec).\n", virt_pid(p), p->pid, p->comm); -+ todo = -1; -+ goto out; -+ } -+ } -+ } while_each_thread_ve(g, p); -+ -+out: -+ if (todo && -+ (time_after(jiffies, start_time + 10*HZ) || -+ signal_pending(current) || todo < 0)) { -+ do_each_thread_ve(g, p) { -+ if (vps_child_level(root, p) >= 0) { -+ spin_lock_irq(&p->sighand->siglock); -+ clear_tsk_thread_flag(p, TIF_FREEZE); -+ if (p->flags & PF_FROZEN) { -+ p->flags &= ~PF_FROZEN; -+ wake_up_process(p); -+ } -+ spin_unlock_irq(&p->sighand->siglock); -+ } -+ } while_each_thread_ve(g, p); -+ if (todo > 0) -+ todo = -2; -+ /* This is sign of failure of printk(), which is not -+ * ours. So, no prefixes. */ -+ printk(">\n"); -+ } -+ -+ read_unlock(&tasklist_lock); -+ -+ if (!todo) -+ return 0; -+ -+ if (todo == -1) { -+ eprintk_ctx("suspend is impossible now.\n"); -+ return -EAGAIN; -+ } -+ -+ if (todo == -2) { -+ eprintk_ctx("interrupted or timed out.\n"); -+ return -EINTR; -+ } -+ -+ if (time_after(jiffies, start_time + 10*HZ) || -+ signal_pending(current)) { -+ if (todo == -3) { -+ eprintk_ctx("vfork() is active, suspend is impossible now.\n"); -+ } else { -+ eprintk_ctx("suspend is impossible, reason %d\n", todo); -+ } -+ return -EAGAIN; -+ } -+ -+ if (todo < 0 || round > 0) { -+ current->state = TASK_INTERRUPTIBLE; -+ schedule_timeout(HZ/50); -+ } else { -+ yield(); -+ } -+ -+ read_lock(&tasklist_lock); -+ round++; -+ } -+ -+ read_unlock(&tasklist_lock); -+ return err; -+} -+ -+static int cpt_unlock_ve(struct cpt_context *ctx) -+{ -+ struct ve_struct *env; -+ -+ env = get_ve_by_id(ctx->ve_id); -+ if (!env) -+ return -ESRCH; -+ down_write(&env->op_sem); -+ env->is_locked = 0; -+ up_write(&env->op_sem); -+ put_ve(env); -+ return 0; -+} -+ -+int cpt_resume(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ cpt_unlock_sockets(ctx); -+ -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ if (ctx->pgin_task) { -+ wait_for_completion(&ctx->pgin_notify); -+ put_task_struct(ctx->pgin_task); -+ ctx->pgin_task = 
NULL; -+ } -+#endif -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ -+ spin_lock_irq(&tsk->sighand->siglock); -+ clear_tsk_thread_flag(tsk, TIF_FREEZE); -+ if (tsk->flags & PF_FROZEN) { -+ tsk->flags &= ~PF_FROZEN; -+ wake_up_process(tsk); -+ } else if (freezable(tsk)) { -+ eprintk_ctx("strange, %s not frozen\n", tsk->comm ); -+ } -+ spin_unlock_irq(&tsk->sighand->siglock); -+ put_task_struct(tsk); -+ } -+ -+ cpt_resume_network(ctx); -+ -+ cpt_unlock_ve(ctx); -+ -+ cpt_finish_ubc(ctx); -+ cpt_object_destroy(ctx); -+ return 0; -+} -+ -+int cpt_kill(struct cpt_context *ctx) -+{ -+ int err = 0; -+ struct ve_struct *env; -+ cpt_object_t *obj; -+ task_t *root_task = NULL; -+ long delay; -+ -+ if (!ctx->ve_id) -+ return -EINVAL; -+ -+ env = get_ve_by_id(ctx->ve_id); -+ if (!env) -+ return -ESRCH; -+ -+ /* from here cpt_kill succeeds */ -+ if (VE_TASK_INFO(current)->owner_env == env) { -+ wprintk_ctx("attempt to kill ve from inside, escaping...\n"); -+ -+ write_lock_irq(&tasklist_lock); -+ VE_TASK_INFO(current)->owner_env = get_ve0(); -+ REMOVE_VE_LINKS(current); -+ SET_VE_LINKS(current); -+ -+ atomic_inc(&get_ve0()->pcounter); -+ atomic_dec(&env->pcounter); -+ write_unlock_irq(&tasklist_lock); -+ set_exec_env(get_ve0()); -+ } -+ -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ if (ctx->pgin_task) { -+ wait_for_completion(&ctx->pgin_notify); -+ put_task_struct(ctx->pgin_task); -+ ctx->pgin_task = NULL; -+ } -+#endif -+ -+ cpt_kill_sockets(ctx); -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ -+ if (tsk->exit_state) { -+ put_task_struct(tsk); -+ continue; -+ } -+ -+ if (virt_pid(tsk) == 1) { -+ root_task = tsk; -+ continue; -+ } -+ -+ if (tsk->ptrace) { -+ write_lock_irq(&tasklist_lock); -+ tsk->ptrace = 0; -+ if (!list_empty(&tsk->ptrace_list)) { -+ list_del_init(&tsk->ptrace_list); -+ REMOVE_LINKS(tsk); -+ tsk->parent = tsk->real_parent; -+ SET_LINKS(tsk); -+ } -+ write_unlock_irq(&tasklist_lock); -+ } -+ -+ send_sig(SIGKILL, 
tsk, 1); -+ -+ spin_lock_irq(&tsk->sighand->siglock); -+ sigfillset(&tsk->blocked); -+ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); -+ set_tsk_thread_flag(tsk, TIF_SIGPENDING); -+ clear_tsk_thread_flag(tsk, TIF_FREEZE); -+ if (tsk->flags & PF_FROZEN) -+ tsk->flags &= ~PF_FROZEN; -+ spin_unlock_irq(&tsk->sighand->siglock); -+ -+ wake_up_process(tsk); -+ put_task_struct(tsk); -+ } -+ -+ yield(); -+ -+ if (root_task != NULL) { -+ send_sig(SIGKILL, root_task, 1); -+ -+ spin_lock_irq(&root_task->sighand->siglock); -+ sigfillset(&root_task->blocked); -+ sigdelsetmask(&root_task->blocked, sigmask(SIGKILL)); -+ set_tsk_thread_flag(root_task, TIF_SIGPENDING); -+ clear_tsk_thread_flag(root_task, TIF_FREEZE); -+ if (root_task->flags & PF_FROZEN) -+ root_task->flags &= ~PF_FROZEN; -+ spin_unlock_irq(&root_task->sighand->siglock); -+ -+ wake_up_process(root_task); -+ put_task_struct(root_task); -+ } -+ -+ cpt_finish_ubc(ctx); -+ cpt_object_destroy(ctx); -+ -+ delay = 1; -+ while (atomic_read(&env->counter) != 1) { -+ if (signal_pending(current)) -+ break; -+ current->state = TASK_INTERRUPTIBLE; -+ delay = (delay < HZ) ? 
(delay << 1) : HZ; -+ schedule_timeout(delay); -+ } -+ put_ve(env); -+ -+ return err; -+} -+ -+static void collect_task_ubc(task_t *t, struct cpt_context *ctx) -+{ -+ struct task_beancounter *tbc; -+ -+ tbc = &(t->task_bc); -+ cpt_add_ubc(tbc->exec_ub, ctx); -+ cpt_add_ubc(tbc->task_ub, ctx); -+ cpt_add_ubc(tbc->fork_sub, ctx); -+} -+ -+static cpt_object_t * remember_task(task_t * child, cpt_object_t * head, -+ cpt_context_t * ctx) -+{ -+ cpt_object_t *cobj; -+ -+ if (freezable(child) && !(child->flags&PF_FROZEN)) { -+ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child)); -+ put_task_struct(child); -+ return NULL; -+ } -+ -+ if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG(); -+ if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { -+ put_task_struct(child); -+ return NULL; -+ } -+ cobj->o_count = 1; -+ cpt_obj_setobj(cobj, child, ctx); -+ insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx); -+ collect_task_ubc(child, ctx); -+ return cobj; -+} -+ -+static int vps_collect_tasks(struct cpt_context *ctx) -+{ -+ int err = -ESRCH; -+ cpt_object_t *obj; -+ task_t *root; -+ -+ read_lock(&tasklist_lock); -+ root = find_task_by_pid_ve(1); -+ if (root) -+ get_task_struct(root); -+ read_unlock(&tasklist_lock); -+ -+ if (!root) { -+ err = -ESRCH; -+ eprintk_ctx("vps_collect_tasks: cannot find root\n"); -+ goto out; -+ } -+ -+ if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { -+ put_task_struct(root); -+ return -ENOMEM; -+ } -+ obj->o_count = 1; -+ cpt_obj_setobj(obj, root, ctx); -+ intern_cpt_object(CPT_OBJ_TASK, obj, ctx); -+ collect_task_ubc(root, ctx); -+ -+ /* Collect process subtree recursively */ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ cpt_object_t *head = obj; -+ task_t *tsk = obj->o_obj; -+ task_t *child; -+ -+ if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) { -+ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk)); -+ err = -EINVAL; -+ goto out; -+ } -+ -+ wait_task_inactive(tsk); -+ -+ if (tsk->pid == tsk->tgid) { -+ child = 
tsk; -+ for (;;) { -+ read_lock(&tasklist_lock); -+ child = next_thread(child); -+ if (child != tsk) -+ get_task_struct(child); -+ read_unlock(&tasklist_lock); -+ -+ if (child == tsk) -+ break; -+ -+ if (child->real_parent != tsk->real_parent) { -+ put_task_struct(child); -+ eprintk_ctx("illegal thread structure, kernel bug\n"); -+ return -EINVAL; -+ } -+ -+ if ((head = remember_task(child, head, ctx)) == NULL) -+ return -ENOMEM; -+ } -+ } -+ -+ /* About locking. VE is frozen. But lists of children -+ * may change at least for init, when entered task reparents -+ * to init and when reparented task exits. If we take care -+ * of this case, we still can unlock while scanning -+ * tasklists. -+ */ -+ read_lock(&tasklist_lock); -+ list_for_each_entry(child, &tsk->children, sibling) { -+ if (child->real_parent != tsk) -+ continue; -+ if (child->pid != child->tgid) -+ continue; -+ get_task_struct(child); -+ read_unlock(&tasklist_lock); -+ -+ if ((head = remember_task(child, head, ctx)) == NULL) -+ return -ENOMEM; -+ -+ read_lock(&tasklist_lock); -+ } -+ -+ list_for_each_entry(child, &tsk->ptrace_children, ptrace_list) { -+ if (child->real_parent != tsk) -+ continue; -+ if (child->pid != child->tgid) -+ continue; -+ get_task_struct(child); -+ read_unlock(&tasklist_lock); -+ -+ if ((head = remember_task(child, head, ctx)) == NULL) -+ return -ENOMEM; -+ -+ read_lock(&tasklist_lock); -+ } -+ read_unlock(&tasklist_lock); -+ } -+ -+ return 0; -+ -+out: -+ return err; -+} -+ -+static int cpt_collect(struct cpt_context *ctx) -+{ -+ int err; -+ -+ if ((err = cpt_collect_mm(ctx)) != 0) -+ return err; -+ -+ if ((err = cpt_collect_sysv(ctx)) != 0) -+ return err; -+ -+ if ((err = cpt_collect_files(ctx)) != 0) -+ return err; -+ -+ if ((err = cpt_collect_fs(ctx)) != 0) -+ return err; -+ -+ if ((err = cpt_collect_namespace(ctx)) != 0) -+ return err; -+ -+ if ((err = cpt_collect_signals(ctx)) != 0) -+ return err; -+ -+ return 0; -+} -+ -+static int cpt_dump_veinfo(cpt_context_t *ctx) -+{ 
-+ struct cpt_veinfo_image i; -+ struct ve_struct *ve; -+ struct timespec delta; -+ -+ cpt_open_section(ctx, CPT_SECT_VEINFO); -+ cpt_open_object(NULL, ctx); -+ -+ i.cpt_next = CPT_NULL; -+ i.cpt_object = CPT_OBJ_VEINFO; -+ i.cpt_hdrlen = sizeof(i); -+ i.cpt_content = CPT_CONTENT_VOID; -+ -+ ve = get_exec_env(); -+ i.shm_ctl_all = ve->_shm_ctlall; -+ i.shm_ctl_max = ve->_shm_ctlmax; -+ i.shm_ctl_mni = ve->_shm_ctlmni; -+ -+ i.msg_ctl_max = ve->_msg_ctlmax; -+ i.msg_ctl_mni = ve->_msg_ctlmni; -+ i.msg_ctl_mnb = ve->_msg_ctlmnb; -+ -+ BUG_ON(sizeof(ve->_sem_ctls) != sizeof(i.sem_ctl_arr)); -+ i.sem_ctl_arr[0] = ve->_sem_ctls[0]; -+ i.sem_ctl_arr[1] = ve->_sem_ctls[1]; -+ i.sem_ctl_arr[2] = ve->_sem_ctls[2]; -+ i.sem_ctl_arr[3] = ve->_sem_ctls[3]; -+ -+ do_posix_clock_monotonic_gettime(&delta); -+ _set_normalized_timespec(&delta, -+ delta.tv_sec - ve->start_timespec.tv_sec, -+ delta.tv_nsec - ve->start_timespec.tv_nsec); -+ i.start_timespec_delta = cpt_timespec_export(&delta); -+ i.start_jiffies_delta = get_jiffies_64() - ve->start_jiffies; -+ -+ ctx->write(&i, sizeof(i), ctx); -+ cpt_close_object(ctx); -+ cpt_close_section(ctx); -+ return 0; -+} -+ -+static int cpt_dump_utsname(cpt_context_t *ctx) -+{ -+ int len; -+ struct cpt_object_hdr o; -+ -+ cpt_open_section(ctx, CPT_SECT_UTSNAME); -+ -+ len = strlen(ve_utsname.nodename); -+ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1); -+ o.cpt_object = CPT_OBJ_NAME; -+ o.cpt_hdrlen = sizeof(o); -+ o.cpt_content = CPT_CONTENT_NAME; -+ -+ ctx->write(&o, sizeof(o), ctx); -+ ctx->write(ve_utsname.nodename, len+1, ctx); -+ ctx->align(ctx); -+ -+ len = strlen(ve_utsname.domainname); -+ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1); -+ o.cpt_object = CPT_OBJ_NAME; -+ o.cpt_hdrlen = sizeof(o); -+ o.cpt_content = CPT_CONTENT_NAME; -+ -+ ctx->write(&o, sizeof(o), ctx); -+ ctx->write(ve_utsname.domainname, len+1, ctx); -+ ctx->align(ctx); -+ -+ cpt_close_section(ctx); -+ return 0; -+} -+ -+int cpt_dump(struct cpt_context *ctx) -+{ -+ struct 
ve_struct *oldenv, *env; -+ int err, err2 = 0; -+ -+ if (!ctx->ve_id) -+ return -EINVAL; -+ -+ env = get_ve_by_id(ctx->ve_id); -+ if (!env) -+ return -ESRCH; -+ -+ down_read(&env->op_sem); -+ err = -ESRCH; -+ if (!env->is_running) -+ goto out_noenv; -+ if (!env->is_locked) -+ goto out_noenv; -+ -+ oldenv = set_exec_env(env); -+ -+ /* Phase 2: real checkpointing */ -+ err = cpt_open_dumpfile(ctx); -+ if (err) -+ goto out; -+ -+ cpt_major_hdr_out(ctx); -+ -+ if (!err) -+ err = cpt_dump_veinfo(ctx); -+ if (!err) -+ err = cpt_dump_ubc(ctx); -+ if (!err) -+ err = cpt_dump_ifinfo(ctx); -+ if (!err) -+ err = cpt_dump_files(ctx); -+ if (!err) -+ err = cpt_dump_files_struct(ctx); -+ if (!err) -+ err = cpt_dump_fs_struct(ctx); -+ if (!err) -+ err = cpt_dump_namespace(ctx); -+ if (!err) -+ err = cpt_dump_sighand(ctx); -+ if (!err) -+ err = cpt_dump_vm(ctx); -+ if (!err) -+ err = cpt_dump_sysvsem(ctx); -+ if (!err) -+ err = cpt_dump_tasks(ctx); -+ if (!err) -+ err = cpt_dump_orphaned_sockets(ctx); -+#if defined(CONFIG_VE_IPTABLES) && \ -+ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) -+ if (!err) -+ err = cpt_dump_ip_conntrack(ctx); -+#endif -+ if (!err) -+ err = cpt_dump_utsname(ctx); -+ -+ if (!err) -+ err = cpt_dump_tail(ctx); -+ -+ err2 = cpt_close_dumpfile(ctx); -+ -+out: -+ set_exec_env(oldenv); -+out_noenv: -+ up_read(&env->op_sem); -+ put_ve(env); -+ return err ? 
/*
 * Freeze a VE and collect everything that a later cpt_dump() will write.
 *
 * If ctx->ve_id is zero the calling task's own VE is used (and rejected with
 * -EINVAL when that is VE0); otherwise the VE is looked up by id (-ESRCH when
 * it does not exist).  Returns 0 on success, -ESRCH if the VE is not running,
 * -EBUSY if it is already locked, or the first error reported by one of the
 * stop/collect phases.
 *
 * NOTE(review): env->is_locked is set to 1 here and is not cleared on any
 * path of this function, including the failing ones - presumably the
 * resume/kill path clears it; confirm against the callers.
 */
int cpt_vps_suspend(struct cpt_context *ctx)
{
	struct ve_struct *oldenv, *env;
	int err = 0;

	ctx->kernel_config_flags = test_kernel_config();
	cpt_object_init(ctx);

	if (!ctx->ve_id) {
		env = get_exec_env();
		if (env == get_ve0())
			return -EINVAL;
		wprintk("undefined ve_id\n");
		ctx->ve_id = env->veid;
		/* Take the reference that the exit paths drop with put_ve(). */
		get_ve(env);
	} else {
		env = get_ve_by_id(ctx->ve_id);
		if (!env)
			return -ESRCH;
	}

	ctx->iptables_mask = env->_iptables_modules;

	/* The running/locked checks and the is_locked update are done under
	 * the write side of op_sem. */
	down_write(&env->op_sem);
	err = -ESRCH;
	if (!env->is_running)
		goto out_noenv;

	err = -EBUSY;
	if (env->is_locked)
		goto out_noenv;
	env->is_locked = 1;
	/* From here on only the read side is held, hence up_read() at "out". */
	downgrade_write(&env->op_sem);

	oldenv = set_exec_env(env);

	/* Phase 0: find and stop all the tasks */
	if ((err = vps_stop_tasks(ctx)) != 0)
		goto out;

	if ((err = cpt_suspend_network(ctx)) != 0)
		goto out;

	/* At the moment all the state is frozen. We do not need to lock
	 * the state, which can be changed only if the tasks are running.
	 */

	/* Phase 1: collect task tree */
	if ((err = vps_collect_tasks(ctx)) != 0)
		goto out;

	/* Phase 1': collect all the resources */
	if ((err = cpt_collect(ctx)) != 0)
		goto out;

out:
	/* Reached after the downgrade: restore the caller's VE context and
	 * release the read lock. */
	set_exec_env(oldenv);
	up_read(&env->op_sem);
	put_ve(env);
	return err;

out_noenv:
	/* Reached before the downgrade: the write lock is still held and the
	 * exec environment was never switched. */
	up_write(&env->op_sem);
	put_ve(env);
	return err;
}
linux-2.6.16-026test009/kernel/cpt/cpt_epoll.c ---- linux-2.6.16.orig/kernel/cpt/cpt_epoll.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_epoll.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,116 @@ -+/* -+ * -+ * kernel/cpt/cpt_epoll.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/major.h> -+#include <linux/pipe_fs_i.h> -+#include <linux/mman.h> -+#include <linux/namespace.h> -+#include <linux/mount.h> -+#include <linux/namei.h> -+#include <linux/smp_lock.h> -+#include <asm/uaccess.h> -+#include <linux/vzcalluser.h> -+#include <linux/eventpoll.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_files.h" -+#include "cpt_kernel.h" -+#include "cpt_fsmagic.h" -+#include "cpt_syscalls.h" -+ -+extern struct file_operations eventpoll_fops; -+ -+int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx) -+{ -+ int err = 0; -+ struct file *file = obj->o_obj; -+ struct eventpoll *ep; -+ struct rb_node *rbp; -+ struct cpt_epoll_image ei; -+ -+ if (file->f_op != &eventpoll_fops) { -+ eprintk_ctx("bad epoll file\n"); -+ return -EINVAL; -+ } -+ -+ ep = file->private_data; -+ -+ /* eventpoll.c does not protect open /proc/N/fd, silly. 
-+ * Opener will get an invalid file with uninitialized private_data -+ */ -+ if (unlikely(ep == NULL)) { -+ eprintk_ctx("bad epoll device\n"); -+ return -EINVAL; -+ } -+ -+ cpt_open_object(NULL, ctx); -+ -+ ei.cpt_next = CPT_NULL; -+ ei.cpt_object = CPT_OBJ_EPOLL; -+ ei.cpt_hdrlen = sizeof(ei); -+ ei.cpt_content = CPT_CONTENT_ARRAY; -+ ei.cpt_file = obj->o_pos; -+ -+ ctx->write(&ei, sizeof(ei), ctx); -+ -+ down(&epsem); -+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { -+ loff_t saved_obj; -+ cpt_object_t *tobj; -+ struct cpt_epoll_file_image efi; -+ struct epitem *epi; -+ epi = rb_entry(rbp, struct epitem, rbn); -+ tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx); -+ if (tobj == NULL) { -+ eprintk_ctx("epoll device refers to an external file\n"); -+ err = -EBUSY; -+ break; -+ } -+ cpt_push_object(&saved_obj, ctx); -+ cpt_open_object(NULL, ctx); -+ -+ efi.cpt_next = CPT_NULL; -+ efi.cpt_object = CPT_OBJ_EPOLL_FILE; -+ efi.cpt_hdrlen = sizeof(efi); -+ efi.cpt_content = CPT_CONTENT_VOID; -+ efi.cpt_file = tobj->o_pos; -+ efi.cpt_fd = epi->ffd.fd; -+ efi.cpt_events = epi->event.events; -+ efi.cpt_data = epi->event.data; -+ efi.cpt_revents = epi->revents; -+ efi.cpt_ready = 0; -+ if (!list_empty(&epi->rdllink)) -+ efi.cpt_ready = 1; -+ -+ ctx->write(&efi, sizeof(efi), ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ } -+ up(&epsem); -+ -+ cpt_close_object(ctx); -+ -+ return err; -+} -+ -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_files.c linux-2.6.16-026test009/kernel/cpt/cpt_files.c ---- linux-2.6.16.orig/kernel/cpt/cpt_files.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_files.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,1343 @@ -+/* -+ * -+ * kernel/cpt/cpt_files.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/major.h> -+#include <linux/pipe_fs_i.h> -+#include <linux/mman.h> -+#include <linux/namespace.h> -+#include <linux/mount.h> -+#include <linux/namei.h> -+#include <linux/smp_lock.h> -+#include <linux/pagemap.h> -+#include <asm/uaccess.h> -+#include <linux/vzcalluser.h> -+#include <linux/ve_proto.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_files.h" -+#include "cpt_socket.h" -+#include "cpt_kernel.h" -+#include "cpt_fsmagic.h" -+#include "cpt_syscalls.h" -+ -+void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt) -+{ -+ char *path; -+ unsigned long pg = __get_free_page(GFP_KERNEL); -+ -+ if (!pg) -+ return; -+ -+ path = d_path(d, mnt, (char *)pg, PAGE_SIZE); -+ -+ if (!IS_ERR(path)) -+ printk("<%s>", path); -+ free_page(pg); -+} -+ -+int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, -+ cpt_context_t *ctx) -+{ -+ if (path[0] == '/' && !IS_ROOT(d) && !d_unhashed(d)) { -+ struct nameidata nd; -+ if (path_lookup(path, 0, &nd)) { -+ eprintk_ctx("d_path cannot be looked up %s\n", path); -+ return -EINVAL; -+ } -+ if (nd.dentry != d || nd.mnt != mnt) { -+ eprintk_ctx("d_path is invisible %s\n", path); -+ path_release(&nd); -+ return -EINVAL; -+ } -+ path_release(&nd); -+ } -+ return 0; -+} -+ -+int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) -+{ -+ int len; -+ char *path; -+ char *pg = cpt_get_buf(ctx); -+ -+ path = d_path(d, mnt, pg, PAGE_SIZE); -+ len = PTR_ERR(path); -+ -+ if (IS_ERR(path)) { -+ struct cpt_object_hdr o; -+ char tmp[1]; -+ /* VZ changes d_path() to return EINVAL, when path -+ * is not supposed to be visible 
inside VE. */ -+ if (len != -EINVAL) -+ eprintk_ctx("d_path err=%d\n", len); -+ else -+ len = 0; -+ -+ o.cpt_next = sizeof(o) + CPT_ALIGN(1); -+ o.cpt_object = CPT_OBJ_NAME; -+ o.cpt_hdrlen = sizeof(o); -+ o.cpt_content = CPT_CONTENT_NAME; -+ tmp[0] = 0; -+ -+ ctx->write(&o, sizeof(o), ctx); -+ ctx->write(tmp, 1, ctx); -+ ctx->align(ctx); -+ -+ __cpt_release_buf(ctx); -+ return len; -+ } else { -+ struct cpt_object_hdr o; -+ -+ len = pg + PAGE_SIZE - 1 - path; -+ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1); -+ o.cpt_object = CPT_OBJ_NAME; -+ o.cpt_hdrlen = sizeof(o); -+ o.cpt_content = CPT_CONTENT_NAME; -+ path[len] = 0; -+ -+ if (cpt_verify_overmount(path, d, mnt, ctx)) { -+ __cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ -+ ctx->write(&o, sizeof(o), ctx); -+ ctx->write(path, len+1, ctx); -+ ctx->align(ctx); -+ __cpt_release_buf(ctx); -+ } -+ return 0; -+} -+ -+int cpt_dump_string(const char *s, struct cpt_context *ctx) -+{ -+ int len; -+ struct cpt_object_hdr o; -+ -+ len = strlen(s); -+ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1); -+ o.cpt_object = CPT_OBJ_NAME; -+ o.cpt_hdrlen = sizeof(o); -+ o.cpt_content = CPT_CONTENT_NAME; -+ -+ ctx->write(&o, sizeof(o), ctx); -+ ctx->write(s, len+1, ctx); -+ ctx->align(ctx); -+ return 0; -+} -+ -+int cpt_dump_filename(struct file *file, struct cpt_context *ctx) -+{ -+ return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, ctx); -+} -+ -+int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_inode_image *v = cpt_get_buf(ctx); -+ struct kstat sbuf; -+ -+ v->cpt_next = sizeof(*v); -+ v->cpt_object = CPT_OBJ_INODE; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) { -+ cpt_release_buf(ctx); -+ return err; -+ } -+ -+ v->cpt_dev = d->d_inode->i_sb->s_dev; -+ v->cpt_ino = d->d_inode->i_ino; -+ v->cpt_mode = sbuf.mode; -+ v->cpt_nlink = sbuf.nlink; -+ v->cpt_uid = sbuf.uid; -+ v->cpt_gid = sbuf.gid; 
-+ v->cpt_rdev = d->d_inode->i_rdev; -+ v->cpt_size = sbuf.size; -+ v->cpt_atime = cpt_timespec_export(&sbuf.atime); -+ v->cpt_mtime = cpt_timespec_export(&sbuf.mtime); -+ v->cpt_ctime = cpt_timespec_export(&sbuf.ctime); -+ v->cpt_blksize = sbuf.blksize; -+ v->cpt_blocks = sbuf.blocks; -+ v->cpt_sb = d->d_inode->i_sb->s_magic; -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ return 0; -+} -+ -+int cpt_collect_files(cpt_context_t * ctx) -+{ -+ int err; -+ cpt_object_t *obj; -+ int index = 0; -+ -+ /* Collect process fd sets */ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL) -+ return -ENOMEM; -+ } -+ -+ /* Collect files from fd sets */ -+ for_each_object(obj, CPT_OBJ_FILES) { -+ int fd; -+ struct files_struct *f = obj->o_obj; -+ -+ cpt_obj_setindex(obj, index++, ctx); -+ -+ if (obj->o_count != atomic_read(&f->count)) { -+ eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count)); -+ return -EBUSY; -+ } -+ -+ for (fd = 0; fd < f->fdt->max_fds; fd++) { -+ struct file *file = fcheck_files(f, fd); -+ if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL) -+ return -ENOMEM; -+ } -+ } -+ -+ /* Collect files queued by AF_UNIX sockets. */ -+ if ((err = cpt_collect_passedfds(ctx)) < 0) -+ return err; -+ -+ /* OK. At this point we should count all the references. 
*/ -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file = obj->o_obj; -+ struct file *parent; -+ cpt_object_t *ino_obj; -+ -+ if (obj->o_count != atomic_read(&file->f_count)) { -+ eprintk_ctx("file struct is referenced outside %d %d\n", obj->o_count, atomic_read(&file->f_count)); -+ cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); -+ return -EBUSY; -+ } -+ -+ switch (file->f_dentry->d_inode->i_sb->s_magic) { -+ case FSMAGIC_FUTEX: -+ case FSMAGIC_MQUEUE: -+ case FSMAGIC_BDEV: -+ eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic); -+ return -EBUSY; -+ } -+ -+ /* Collect inode. It is necessary mostly to resolve deleted -+ * hard links. */ -+ ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); -+ if (ino_obj == NULL) -+ return -ENOMEM; -+ -+ parent = ino_obj->o_parent; -+ if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) -+ ino_obj->o_parent = file; -+ -+ if (S_ISCHR(file->f_dentry->d_inode->i_mode)) { -+ int maj = imajor(file->f_dentry->d_inode); -+ if (maj == PTY_MASTER_MAJOR || -+ (maj >= UNIX98_PTY_MASTER_MAJOR && -+ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || -+ maj == PTY_SLAVE_MAJOR || -+ maj == UNIX98_PTY_SLAVE_MAJOR || -+ maj == TTYAUX_MAJOR) { -+ err = cpt_collect_tty(file, ctx); -+ if (err) -+ return err; -+ } -+ } -+ -+ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { -+ err = cpt_collect_socket(file, ctx); -+ if (err) -+ return err; -+ } -+ } -+ -+ err = cpt_index_sockets(ctx); -+ -+ return err; -+} -+ -+/* /dev/ptmx is special, all the files share one inode, but real tty backend -+ * is attached via file->private_data. 
-+ */ -+ -+static inline int is_cloning_inode(struct inode *ino) -+{ -+ return S_ISCHR(ino->i_mode) && -+ ino->i_rdev == MKDEV(TTYAUX_MAJOR,2); -+} -+ -+static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx) -+{ -+ pid_t pid; -+ struct cpt_flock_image *v = cpt_get_buf(ctx); -+ -+ v->cpt_next = sizeof(*v); -+ v->cpt_object = CPT_OBJ_FLOCK; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_VOID; -+ -+ v->cpt_owner = owner; -+ -+ pid = fl->fl_pid; -+ if (pid && !is_virtual_pid(fl->fl_pid)) { -+ pid = _pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); -+ if (pid == -1) { -+ if (!(fl->fl_flags&FL_FLOCK)) { -+ eprintk_ctx("posix lock from another VE?\n"); -+ cpt_release_buf(ctx); -+ return -EBUSY; -+ } -+ pid = 0; -+ } -+ } -+ -+ v->cpt_pid = pid; -+ v->cpt_start = fl->fl_start; -+ v->cpt_end = fl->fl_end; -+ v->cpt_flags = fl->fl_flags; -+ v->cpt_type = fl->fl_type; -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ return 0; -+} -+ -+ -+int cpt_dump_flock(struct file *file, struct cpt_context *ctx) -+{ -+ int err = 0; -+ struct file_lock *fl; -+ -+ lock_kernel(); -+ for (fl = file->f_dentry->d_inode->i_flock; -+ fl; fl = fl->fl_next) { -+ if (file != fl->fl_file) -+ continue; -+ if (fl->fl_flags & FL_LEASE) { -+ eprintk_ctx("lease lock is not supported\n"); -+ err = -EINVAL; -+ break; -+ } -+ if (fl->fl_flags & FL_POSIX) { -+ cpt_object_t *obj; -+ obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx); -+ if (obj) { -+ dump_one_flock(fl, obj->o_index, ctx); -+ continue; -+ } else { -+ eprintk_ctx("unknown lock owner %p\n", fl->fl_owner); -+ err = -EINVAL; -+ } -+ } -+ if (fl->fl_flags & FL_FLOCK) { -+ dump_one_flock(fl, -1, ctx); -+ continue; -+ } -+ } -+ unlock_kernel(); -+ return err; -+} -+ -+static int __comb_pid_to_vpid(int pid) -+{ -+ int vpid = pid; -+ -+ if (pid > 0) { -+ vpid = _pid_type_to_vpid(PIDTYPE_PID, pid); -+ if (unlikely(vpid < 0)) { -+ dprintk("pid %d does not exist amymore.\n", pid); -+ return 0; 
-+ } -+ } else if (pid < 0) { -+ vpid = _pid_type_to_vpid(PIDTYPE_PGID, -pid); -+ if (unlikely(vpid < 0)) { -+ dprintk("pgid %d does not exist amymore.\n", -pid); -+ return 0; -+ } -+ vpid = -vpid; -+ } -+ return vpid; -+} -+ -+static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx) -+{ -+ int err = 0; -+ cpt_object_t *iobj; -+ struct cpt_file_image *v = cpt_get_buf(ctx); -+ struct kstat sbuf; -+ -+ cpt_open_object(obj, ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_FILE; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ v->cpt_flags = file->f_flags; -+ v->cpt_mode = file->f_mode; -+ v->cpt_pos = file->f_pos; -+ v->cpt_uid = file->f_uid; -+ v->cpt_gid = file->f_gid; -+ -+ vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf); -+ -+ v->cpt_i_mode = sbuf.mode; -+ v->cpt_lflags = 0; -+ if (IS_ROOT(file->f_dentry)) -+ v->cpt_lflags |= CPT_DENTRY_ROOT; -+ else if (d_unhashed(file->f_dentry)) -+ v->cpt_lflags |= CPT_DENTRY_DELETED; -+ if (is_cloning_inode(file->f_dentry->d_inode)) -+ v->cpt_lflags |= CPT_DENTRY_CLONING; -+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) -+ v->cpt_lflags |= CPT_DENTRY_PROC; -+ v->cpt_inode = CPT_NULL; -+ iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); -+ if (iobj) -+ v->cpt_inode = iobj->o_pos; -+ v->cpt_priv = CPT_NULL; -+ v->cpt_fown_fd = -1; -+ if (S_ISCHR(v->cpt_i_mode)) { -+ iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx); -+ if (iobj) { -+ v->cpt_priv = iobj->o_pos; -+ if (file->f_flags&FASYNC) -+ v->cpt_fown_fd = cpt_tty_fasync(file, ctx); -+ } -+ } -+ if (S_ISSOCK(v->cpt_i_mode)) { -+ if (obj->o_index < 0) { -+ eprintk_ctx("BUG: no socket index\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ v->cpt_priv = obj->o_index; -+ if (file->f_flags&FASYNC) -+ v->cpt_fown_fd = cpt_socket_fasync(file, ctx); -+ } -+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) { -+ v->cpt_priv = 
/*
 * Return 1 if the len bytes starting at addr are all zero, 0 otherwise.
 *
 * A len of zero or less describes no data and yields 1.  The buffer is
 * compared in fixed-size pieces against a block of zero bytes, so the
 * function does not depend on the alignment of addr and never reads past
 * addr + len.  (The previous version read the buffer through an
 * "unsigned long *" cast and turned a negative len into a huge unsigned
 * loop bound.)
 */
static int data_is_zero(char *addr, int len)
{
	static const char zero_chunk[64];	/* implicitly all zero */

	while (len > 0) {
		int step = len < (int)sizeof(zero_chunk) ?
				len : (int)sizeof(zero_chunk);

		if (memcmp(addr, zero_chunk, step) != 0)
			return 0;
		addr += step;
		len -= step;
	}
	return 1;
}
-+ pgb.cpt_start = pos - err; -+ pgb.cpt_end = pgb.cpt_start; -+ ctx->write(&pgb, sizeof(pgb), ctx); -+ } -+ ctx->write(ctx->tmpbuf, err, ctx); -+ pgb.cpt_end += err; -+ } -+ __cpt_release_buf(ctx); -+ } -+ -+ fput(file); -+ -+ if (obj_opened != CPT_NULL) { -+ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); -+ ctx->align(ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_pos, ctx); -+ obj_opened = CPT_NULL; -+ } -+ return 0; -+} -+ -+ -+static int dump_content_chrdev(struct file *file, struct cpt_context *ctx) -+{ -+ struct inode *ino = file->f_dentry->d_inode; -+ int maj; -+ -+ maj = imajor(ino); -+ if (maj == MEM_MAJOR) { -+ /* Well, OK. */ -+ return 0; -+ } -+ if (maj == PTY_MASTER_MAJOR || -+ (maj >= UNIX98_PTY_MASTER_MAJOR && -+ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || -+ maj == PTY_SLAVE_MAJOR || -+ maj == UNIX98_PTY_SLAVE_MAJOR || -+ maj == TTYAUX_MAJOR) { -+ return cpt_dump_content_tty(file, ctx); -+ } -+ eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino)); -+ return -EINVAL; -+} -+ -+static int dump_content_blkdev(struct file *file, struct cpt_context *ctx) -+{ -+ struct inode *ino = file->f_dentry->d_inode; -+ -+ /* We are not going to transfer them. 
*/ -+ eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino)); -+ return -EINVAL; -+} -+ -+static int dump_content_fifo(struct file *file, struct cpt_context *ctx) -+{ -+ struct inode *ino = file->f_dentry->d_inode; -+ cpt_object_t *obj; -+ loff_t saved_pos; -+ int readers; -+ int writers; -+ int anon = 0; -+ -+ mutex_lock(PIPE_MUTEX(*ino)); -+ readers = PIPE_READERS(*ino); -+ writers = PIPE_WRITERS(*ino); -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file1 = obj->o_obj; -+ if (file1->f_dentry->d_inode == ino) { -+ if (file1->f_mode & FMODE_READ) -+ readers--; -+ if (file1->f_mode & FMODE_WRITE) -+ writers--; -+ } -+ } -+ mutex_unlock(PIPE_MUTEX(*ino)); -+ if (readers || writers) { -+ struct dentry *dr = file->f_dentry->d_sb->s_root; -+ if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0) -+ anon = 1; -+ -+ if (anon) { -+ eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers); -+ return -EBUSY; -+ } -+ /* If fifo has external readers/writers, we are in troubles. -+ * If the buffer is not empty, we must move its content. -+ * But if the fifo is owned by a service, we cannot do -+ * this. See? -+ * -+ * For now we assume, that if fifo is opened by another -+ * process, we do not own it and, hence, migrate without -+ * data. -+ */ -+ return 0; -+ } -+ -+ /* OK, we must save fifo state. No semaphores required. 
*/ -+ -+ if (ino->i_pipe->nrbufs) { -+ struct cpt_obj_bits *v = cpt_get_buf(ctx); -+ struct pipe_inode_info *info; -+ int count, buf, nrbufs; -+ -+ mutex_lock(PIPE_MUTEX(*ino)); -+ info = ino->i_pipe; -+ count = 0; -+ buf = info->curbuf; -+ nrbufs = info->nrbufs; -+ while (--nrbufs >= 0) { -+ if (!info->bufs[buf].ops->can_merge) { -+ mutex_unlock(PIPE_MUTEX(*ino)); -+ eprintk_ctx("unknown format of pipe buffer\n"); -+ return -EINVAL; -+ } -+ count += info->bufs[buf].len; -+ buf = (buf+1) & (PIPE_BUFFERS-1); -+ } -+ -+ if (!count) { -+ mutex_unlock(PIPE_MUTEX(*ino)); -+ return 0; -+ } -+ -+ cpt_push_object(&saved_pos, ctx); -+ cpt_open_object(NULL, ctx); -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_BITS; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_DATA; -+ v->cpt_size = count; -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ count = 0; -+ buf = info->curbuf; -+ nrbufs = info->nrbufs; -+ while (--nrbufs >= 0) { -+ struct pipe_buffer *b = info->bufs + buf; -+ void * addr = b->ops->map(file, info, b); -+ ctx->write(addr + b->offset, b->len, ctx); -+ b->ops->unmap(info, b); -+ buf = (buf+1) & (PIPE_BUFFERS-1); -+ } -+ -+ mutex_unlock(PIPE_MUTEX(*ino)); -+ -+ ctx->align(ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_pos, ctx); -+ } -+ -+ return 0; -+} -+ -+static int dump_content_socket(struct file *file, struct cpt_context *ctx) -+{ -+ return 0; -+} -+ -+static int dump_one_inode(struct file *file, struct dentry *d, -+ struct vfsmount *mnt, struct cpt_context *ctx) -+{ -+ int err = 0; -+ struct inode *ino = d->d_inode; -+ cpt_object_t *iobj; -+ int dump_it = 0; -+ -+ iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx); -+ if (!iobj) -+ return -EINVAL; -+ -+ if (iobj->o_pos >= 0) -+ return 0; -+ -+ if (!IS_ROOT(d) && d_unhashed(d)) -+ dump_it = 1; -+ if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) { -+ /* One more bug in epoll: invalid inode mode. -+ * What a load of crap... 
-+ */ -+ if (ino->i_sb->s_magic == FSMAGIC_EPOLL && -+ (ino->i_mode & S_IFMT) == 0) -+ return 0; -+ dump_it = 1; -+ } -+ -+ if (!dump_it) -+ return 0; -+ -+ cpt_open_object(iobj, ctx); -+ cpt_dump_inode(d, mnt, ctx); -+ -+ if (!IS_ROOT(d) && d_unhashed(d)) { -+ struct file *parent; -+ parent = iobj->o_parent; -+ if (!parent || -+ (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) { -+ /* Inode is not deleted, but it does not -+ * have references from inside checkpointed -+ * process group. We have options: -+ * A. Fail, abort checkpointing -+ * B. Proceed. File will be cloned. -+ * A is correct, B is more complicated */ -+ /* Just as a hint where to create deleted file */ -+ if (ino->i_nlink != 0) { -+ eprintk_ctx("deleted reference to existing inode, checkpointing is impossible\n"); -+ return -EBUSY; -+ } -+ } else { -+ /* Refer to _another_ file name. */ -+ err = cpt_dump_filename(parent, ctx); -+ if (err) -+ return err; -+ if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) -+ dump_it = 0; -+ } -+ } -+ if (dump_it) { -+ if (S_ISREG(ino->i_mode)) { -+ if ((err = dump_content_regular(file, ctx)) != 0) { -+ eprintk_ctx("dump_content_regular "); -+ cpt_printk_dentry(d, mnt); -+ } -+ } else if (S_ISDIR(ino->i_mode)) { -+ /* We cannot do anything. The directory should be -+ * empty, so it is not a big deal. 
-+ */ -+ } else if (S_ISCHR(ino->i_mode)) { -+ err = dump_content_chrdev(file, ctx); -+ } else if (S_ISBLK(ino->i_mode)) { -+ err = dump_content_blkdev(file, ctx); -+ } else if (S_ISFIFO(ino->i_mode)) { -+ err = dump_content_fifo(file, ctx); -+ } else if (S_ISSOCK(ino->i_mode)) { -+ err = dump_content_socket(file, ctx); -+ } else { -+ eprintk_ctx("unknown inode mode %o\n", ino->i_mode & S_IFMT); -+ err = -EINVAL; -+ } -+ } -+ cpt_close_object(ctx); -+ -+ return err; -+} -+ -+int cpt_dump_files(struct cpt_context *ctx) -+{ -+ int epoll_nr; -+ cpt_object_t *obj; -+ -+ cpt_open_section(ctx, CPT_SECT_TTY); -+ for_each_object(obj, CPT_OBJ_TTY) { -+ int err; -+ -+ if ((err = cpt_dump_tty(obj, ctx)) != 0) -+ return err; -+ } -+ cpt_close_section(ctx); -+ -+ cpt_open_section(ctx, CPT_SECT_INODE); -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file = obj->o_obj; -+ int err; -+ -+ if ((err = dump_one_inode(file, file->f_dentry, -+ file->f_vfsmnt, ctx)) != 0) -+ return err; -+ } -+ for_each_object(obj, CPT_OBJ_FS) { -+ struct fs_struct *fs = obj->o_obj; -+ int err; -+ -+ if (fs->root && -+ (err = dump_one_inode(NULL, fs->root, fs->rootmnt, ctx)) != 0) -+ return err; -+ if (fs->pwd && -+ (err = dump_one_inode(NULL, fs->pwd, fs->pwdmnt, ctx)) != 0) -+ return err; -+ if (fs->altroot && -+ (err = dump_one_inode(NULL, fs->altroot, fs->altrootmnt, ctx)) != 0) -+ return err; -+ } -+ cpt_close_section(ctx); -+ -+ epoll_nr = 0; -+ cpt_open_section(ctx, CPT_SECT_FILES); -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file = obj->o_obj; -+ int err; -+ -+ if ((err = dump_one_file(obj, file, ctx)) != 0) -+ return err; -+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) -+ epoll_nr++; -+ } -+ cpt_close_section(ctx); -+ -+ if (epoll_nr) { -+ cpt_open_section(ctx, CPT_SECT_EPOLL); -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file = obj->o_obj; -+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) { -+ int err; -+ if ((err = 
cpt_dump_epolldev(obj, ctx)) != 0) -+ return err; -+ } -+ } -+ cpt_close_section(ctx); -+ } -+ -+ cpt_open_section(ctx, CPT_SECT_SOCKET); -+ for_each_object(obj, CPT_OBJ_SOCKET) { -+ int err; -+ -+ if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0) -+ return err; -+ } -+ cpt_close_section(ctx); -+ -+ return 0; -+} -+ -+static int dump_filedesc(int fd, struct file *file, -+ struct files_struct *f, struct cpt_context *ctx) -+{ -+ struct cpt_fd_image *v = cpt_get_buf(ctx); -+ cpt_object_t *obj; -+ -+ cpt_open_object(NULL, ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_FILEDESC; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_VOID; -+ -+ v->cpt_fd = fd; -+ obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx); -+ if (!obj) BUG(); -+ v->cpt_file = obj->o_pos; -+ v->cpt_flags = 0; -+ if (FD_ISSET(fd, f->fdt->close_on_exec)) -+ v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC; -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ cpt_close_object(ctx); -+ -+ return 0; -+} -+ -+static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct files_struct *f = obj->o_obj; -+ struct cpt_files_struct_image *v = cpt_get_buf(ctx); -+ int fd; -+ loff_t saved_obj; -+ -+ cpt_open_object(obj, ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_FILES; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ v->cpt_index = obj->o_index; -+ v->cpt_max_fds = f->fdt->max_fds; -+ v->cpt_next_fd = f->fdt->next_fd; -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ cpt_push_object(&saved_obj, ctx); -+ for (fd = 0; fd < f->fdt->max_fds; fd++) { -+ struct file *file = fcheck_files(f, fd); -+ if (file) -+ dump_filedesc(fd, file, f, ctx); -+ } -+ cpt_pop_object(&saved_obj, ctx); -+ -+ cpt_close_object(ctx); -+ -+ return 0; -+} -+ -+int cpt_dump_files_struct(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ cpt_open_section(ctx, CPT_SECT_FILES_STRUCT); -+ -+ 
for_each_object(obj, CPT_OBJ_FILES) { -+ int err; -+ -+ if ((err = dump_one_file_struct(obj, ctx)) != 0) -+ return err; -+ } -+ -+ cpt_close_section(ctx); -+ return 0; -+} -+ -+int cpt_collect_fs(cpt_context_t * ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ if (tsk->fs) { -+ if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL) -+ return -ENOMEM; -+ if (tsk->fs->pwd && -+ cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd->d_inode, ctx) == NULL) -+ return -ENOMEM; -+ if (tsk->fs->root && -+ cpt_object_add(CPT_OBJ_INODE, tsk->fs->root->d_inode, ctx) == NULL) -+ return -ENOMEM; -+ if (tsk->fs->altroot && -+ cpt_object_add(CPT_OBJ_INODE, tsk->fs->altroot->d_inode, ctx) == NULL) -+ return -ENOMEM; -+ } -+ } -+ return 0; -+} -+ -+static int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) -+{ -+ struct file file; -+ -+ memset(&file, 0, sizeof(file)); -+ -+ file.f_dentry = d; -+ file.f_vfsmnt = mnt; -+ file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK; -+ return dump_one_file(NULL, &file, ctx); -+} -+ -+static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct fs_struct *fs = obj->o_obj; -+ struct cpt_fs_struct_image *v = cpt_get_buf(ctx); -+ loff_t saved_obj; -+ int err; -+ -+ cpt_open_object(obj, ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_FS; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ v->cpt_umask = fs->umask; -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ cpt_push_object(&saved_obj, ctx); -+ err = cpt_dump_dir(fs->root, fs->rootmnt, ctx); -+ if (!err) -+ err = cpt_dump_dir(fs->pwd, fs->pwdmnt, ctx); -+ if (!err && fs->altroot) -+ err = cpt_dump_dir(fs->altroot, fs->altrootmnt, ctx); -+ -+ cpt_pop_object(&saved_obj, ctx); -+ -+ cpt_close_object(ctx); -+ -+ return err; -+} -+ -+int cpt_dump_fs_struct(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ cpt_open_section(ctx, CPT_SECT_FS); -+ 
-+ for_each_object(obj, CPT_OBJ_FS) { -+ int err; -+ -+ if ((err = dump_one_fs(obj, ctx)) != 0) -+ return err; -+ } -+ -+ cpt_close_section(ctx); -+ return 0; -+} -+ -+static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ int err = 0; -+ struct namespace *n = obj->o_obj; -+ struct list_head *p; -+ char *path_buf, *path; -+ -+ path_buf = (char *) __get_free_page(GFP_KERNEL); -+ if (!path_buf) -+ return -ENOMEM; -+ -+ down_read(&namespace_sem); -+ list_for_each(p, &n->list) { -+ struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); -+ -+ path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); -+ if (IS_ERR(path)) -+ continue; -+ -+ if ( -+ strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && -+ strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && -+ strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && -+ strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && -+ strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && -+ strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && -+ strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0) { -+ eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name); -+ err = -EINVAL; -+ break; -+ } -+ } -+ up_read(&namespace_sem); -+ -+ free_page((unsigned long) path_buf); -+ -+ return err; -+} -+ -+int cpt_collect_namespace(cpt_context_t * ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ if (tsk->namespace && cpt_object_add(CPT_OBJ_NAMESPACE, tsk->namespace, ctx) == NULL) -+ return -ENOMEM; -+ } -+ -+ for_each_object(obj, CPT_OBJ_NAMESPACE) { -+ int err; -+ if ((err = check_one_namespace(obj, ctx)) != 0) -+ return err; -+ } -+ -+ return 0; -+} -+ -+struct args_t -+{ -+ int* pfd; -+ char* path; -+}; -+ -+static int dumptmpfs(void *arg) -+{ -+ int i; -+ struct args_t *args = arg; -+ int *pfd = args->pfd; -+ char *path = args->path; -+ char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL }; -+ -+ i = real_env_create(VEID(get_exec_env()), 
VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); -+ if (i < 0) { -+ eprintk("cannot enter ve to dump tmpfs\n"); -+ module_put(THIS_MODULE); -+ return 1; -+ } -+ -+ if (pfd[1] != 1) -+ sc_dup2(pfd[1], 1); -+ -+ for (i=0; i<current->files->fdt->max_fds; i++) { -+ if (i != 1) -+ sc_close(i); -+ } -+ -+ module_put(THIS_MODULE); -+ -+ set_fs(KERNEL_DS); -+ i = sc_execve("/bin/tar", argv, NULL); -+ eprintk("failed to exec /bin/tar: %d\n", i); -+ return -1; -+} -+ -+static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx) -+{ -+ int err; -+ int pid; -+ int pfd[2]; -+ struct file *f; -+ struct cpt_object_hdr v; -+ char buf[16]; -+ int n; -+ loff_t saved_obj; -+ struct args_t args; -+ -+ err = sc_pipe(pfd); -+ if (err < 0) -+ return err; -+ args.pfd = pfd; -+ args.path = path; -+ err = pid = local_kernel_thread(dumptmpfs, (void*)&args, SIGCHLD, 0); -+ if (err < 0) -+ goto out; -+ f = fget(pfd[0]); -+ sc_close(pfd[1]); -+ sc_close(pfd[0]); -+ -+ cpt_push_object(&saved_obj, ctx); -+ cpt_open_object(NULL, ctx); -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_NAME; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_NAME; -+ -+ ctx->write(&v, sizeof(v), ctx); -+ -+ do { -+ mm_segment_t oldfs; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); -+ set_fs(oldfs); -+ if (n > 0) -+ ctx->write(buf, n, ctx); -+ } while (n > 0); -+ -+ fput(f); -+ -+ if ((err = sc_waitx(pid, 0)) < 0) -+ eprintk_ctx("wait4: %d\n", err); -+ -+ buf[0] = 0; -+ ctx->write(buf, 1, ctx); -+ ctx->align(ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ return n; -+ -+out: -+ if (pfd[1] >= 0) -+ sc_close(pfd[1]); -+ if (pfd[0] >= 0) -+ sc_close(pfd[0]); -+ return err; -+} -+ -+static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx) -+{ -+ int err = 0; -+ struct cpt_vfsmount_image v; -+ loff_t saved_obj; -+ char *path_buf, *path; -+ -+ path_buf = (char *) __get_free_page(GFP_KERNEL); -+ if (!path_buf) -+ return -ENOMEM; -+ -+ 
path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); -+ if (IS_ERR(path)) { -+ free_page((unsigned long) path_buf); -+ return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path); -+ } -+ -+ cpt_open_object(NULL, ctx); -+ -+ v.cpt_next = -1; -+ v.cpt_object = CPT_OBJ_VFSMOUNT; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_ARRAY; -+ -+ v.cpt_mntflags = mnt->mnt_flags; -+ v.cpt_flags = mnt->mnt_sb->s_flags; -+ -+ ctx->write(&v, sizeof(v), ctx); -+ -+ cpt_push_object(&saved_obj, ctx); -+ cpt_dump_string(mnt->mnt_devname ? : "none", ctx); -+ cpt_dump_string(path, ctx); -+ cpt_dump_string(mnt->mnt_sb->s_type->name, ctx); -+#if 0 -+ /* This is an evident crap. Ask Savochkin, he might know this. -+ * Goal is to get some path to mount --bind to. -+ */ -+ cpt_dump_dentry(mnt->mnt_root, mnt->mnt_parent, ctx); -+#else -+ /* For now we just bail, when some FS is mounted not at root. */ -+ if (mnt->mnt_root != mnt->mnt_sb->s_root) { -+ eprintk_ctx("mount --bind prevents checkpointing\n"); -+ err = -EINVAL; -+ } -+#endif -+ -+ if (strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) { -+ cpt_dump_tmpfs(path, ctx); -+ } -+ -+ cpt_pop_object(&saved_obj, ctx); -+ -+ cpt_close_object(ctx); -+ -+ free_page((unsigned long) path_buf); -+ -+ return err; -+} -+ -+static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct namespace *n = obj->o_obj; -+ struct cpt_object_hdr v; -+ struct list_head *p; -+ loff_t saved_obj; -+ int err = 0; -+ -+ cpt_open_object(obj, ctx); -+ -+ v.cpt_next = -1; -+ v.cpt_object = CPT_OBJ_NAMESPACE; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_ARRAY; -+ -+ ctx->write(&v, sizeof(v), ctx); -+ -+ cpt_push_object(&saved_obj, ctx); -+ -+ down_read(&namespace_sem); -+ list_for_each(p, &n->list) { -+ err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx); -+ if (err) -+ break; -+ } -+ up_read(&namespace_sem); -+ -+ cpt_pop_object(&saved_obj, ctx); -+ -+ cpt_close_object(ctx); -+ -+ return err; -+} -+ -+int 
cpt_dump_namespace(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ cpt_open_section(ctx, CPT_SECT_NAMESPACE); -+ -+ for_each_object(obj, CPT_OBJ_NAMESPACE) { -+ int err; -+ -+ if ((err = dump_one_namespace(obj, ctx)) != 0) -+ return err; -+ } -+ -+ cpt_close_section(ctx); -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_files.h linux-2.6.16-026test009/kernel/cpt/cpt_files.h ---- linux-2.6.16.orig/kernel/cpt/cpt_files.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_files.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,46 @@ -+int cpt_collect_files(cpt_context_t *); -+int cpt_collect_fs(cpt_context_t *); -+int cpt_collect_namespace(cpt_context_t *); -+int cpt_collect_sysvsem_undo(cpt_context_t *); -+int cpt_collect_tty(struct file *, cpt_context_t *); -+int cpt_dump_files(struct cpt_context *ctx); -+int cpt_dump_files_struct(struct cpt_context *ctx); -+int cpt_dump_fs_struct(struct cpt_context *ctx); -+int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx); -+int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx); -+int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx); -+struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx); -+struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx); -+__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx); -+ -+int rst_posix_locks(struct cpt_context *ctx); -+ -+struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx); -+int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx); -+__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx); -+int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx); -+int rst_restore_fs(struct cpt_context *ctx); -+ -+int cpt_collect_sysv(cpt_context_t *); -+int cpt_dump_sysvsem(struct cpt_context *ctx); -+int rst_sysv_ipc(struct cpt_context *ctx); -+int 
rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx); -+__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx); -+ -+int cpt_dump_namespace(struct cpt_context *ctx); -+int rst_root_namespace(struct cpt_context *ctx); -+ -+int rst_stray_files(struct cpt_context *ctx); -+int rst_tty_jobcontrol(struct cpt_context *ctx); -+ -+void rst_flush_filejobs(struct cpt_context *); -+int rst_do_filejobs(struct cpt_context *); -+ -+int rst_eventpoll(struct cpt_context *); -+struct file *cpt_open_epolldev(struct cpt_file_image *fi, -+ unsigned flags, -+ struct cpt_context *ctx); -+int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *); -+ -+int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, -+ cpt_context_t *ctx); -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_fsmagic.h linux-2.6.16-026test009/kernel/cpt/cpt_fsmagic.h ---- linux-2.6.16.orig/kernel/cpt/cpt_fsmagic.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_fsmagic.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,15 @@ -+/* Collected from kernel sources. */ -+ -+#define FSMAGIC_TMPFS 0x01021994 -+#define FSMAGIC_PIPEFS 0x50495045 -+#define FSMAGIC_SOCKFS 0x534F434B -+#define FSMAGIC_PFMFS 0xa0b4d889 -+#define FSMAGIC_BDEV 0x62646576 -+#define FSMAGIC_EPOLL 0x03111965 -+#define FSMAGIC_FUTEX 0x0BAD1DEA -+#define FSMAGIC_MQUEUE 0x19800202 -+#define FSMAGIC_PROC 0x9fa0 -+#define FSMAGIC_DEVPTS 0x1CD1 -+#define FSMAGIC_AUTOFS 0x0187 -+#define FSMAGIC_EXT2 0xEF53 -+#define FSMAGIC_REISER 0x52654973 -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_kernel.c linux-2.6.16-026test009/kernel/cpt/cpt_kernel.c ---- linux-2.6.16.orig/kernel/cpt/cpt_kernel.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_kernel.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,124 @@ -+/* -+ * -+ * kernel/cpt/cpt_kernel.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. 
-+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#define __KERNEL_SYSCALLS__ 1 -+ -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/sched.h> -+#include <linux/mm.h> -+#include <linux/kernel.h> -+#include <asm/cpufeature.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_kernel.h" -+#include "cpt_syscalls.h" -+ -+#ifndef CONFIG_X86_64 -+ -+extern void local_kernel_thread_helper(void); -+__asm__(".section .text\n" -+ ".align 4\n" -+ "local_kernel_thread_helper:\n\t" -+ "movl %edx,%eax\n\t" -+ "pushl %edx\n\t" -+ "call *%ebx\n\t" -+ "pushl %eax\n\t" -+ "pushl $0\n\t" -+ "call complete_and_exit\n" -+ ".previous"); -+ -+/* -+ * Create a kernel thread -+ */ -+int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) -+{ -+ struct pt_regs regs; -+ -+ memset(&regs, 0, sizeof(regs)); -+ -+ regs.ebx = (unsigned long) fn; -+ regs.edx = (unsigned long) arg; -+ -+ regs.xds = __USER_DS; -+ regs.xes = __USER_DS; -+ regs.orig_eax = -1; -+ regs.eip = (unsigned long) local_kernel_thread_helper; -+ regs.xcs = __KERNEL_CS; -+ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; -+ -+ /* Ok, create the new process.. 
*/ -+ return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL, pid); -+} -+#endif -+ -+int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) -+{ -+ pid_t ret; -+ -+ if (!try_module_get(THIS_MODULE)) -+ return -EBUSY; -+ ret = asm_kernel_thread(fn, arg, flags, pid); -+ if (ret < 0) -+ module_put(THIS_MODULE); -+ return ret; -+} -+ -+#ifdef __i386__ -+static int errno; -+#endif -+ -+int sc_execve(char *cmd, char **argv, char **env) -+{ -+ int ret; -+ ret = execve(cmd, argv, env); -+#ifdef __i386__ -+ if (ret < 0) -+ ret = -errno; -+#endif -+ return ret; -+} -+ -+unsigned int test_cpu_caps() -+{ -+ unsigned int flags = 0; -+ if (boot_cpu_has(X86_FEATURE_CMOV)) -+ flags |= 1 << CPT_CPU_X86_CMOV; -+ if (cpu_has_fxsr) -+ flags |= 1 << CPT_CPU_X86_FXSR; -+ if (cpu_has_xmm) -+ flags |= 1 << CPT_CPU_X86_SSE; -+#ifndef CONFIG_X86_64 -+ if (cpu_has_xmm2) -+#endif -+ flags |= 1 << CPT_CPU_X86_SSE2; -+ if (cpu_has_mmx) -+ flags |= 1 << CPT_CPU_X86_MMX; -+ if (boot_cpu_has(X86_FEATURE_3DNOW)) -+ flags |= 1 << CPT_CPU_X86_3DNOW; -+ if (boot_cpu_has(X86_FEATURE_3DNOWEXT)) -+ flags |= 1 << CPT_CPU_X86_3DNOW2; -+ if (boot_cpu_has(X86_FEATURE_SEP)) -+ flags |= 1 << CPT_CPU_X86_SEP; -+#ifdef CONFIG_X86_64 -+ flags |= 1 << CPT_CPU_X86_EMT64; -+#endif -+ return flags; -+} -+ -+unsigned int test_kernel_config() -+{ -+ unsigned int flags = 0; -+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -+ flags |= 1 << CPT_KERNEL_CONFIG_PAE; -+#endif -+ return flags; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_kernel.h linux-2.6.16-026test009/kernel/cpt/cpt_kernel.h ---- linux-2.6.16.orig/kernel/cpt/cpt_kernel.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_kernel.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,74 @@ -+/* Interface to kernel vars which we had to _add_. 
*/ -+ -+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -+ -+#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) -+#define TASK_TRACED TASK_STOPPED -+#define unix_peer(sk) ((sk)->sk_pair) -+#define page_mapcount(pg) ((pg)->mapcount) -+#else -+#define unix_peer(sk) (unix_sk(sk)->peer) -+#endif -+ -+#ifdef CONFIG_X86_64 -+#define cpu_has_fxsr 1 -+#endif -+ -+static inline void do_gettimespec(struct timespec *ts) -+{ -+ struct timeval tv; -+ do_gettimeofday(&tv); -+ ts->tv_sec = tv.tv_sec; -+ ts->tv_nsec = tv.tv_usec*1000; -+} -+ -+int local_kernel_thread(int (*fn)(void *), -+ void * arg, -+ unsigned long flags, -+ pid_t pid); -+int asm_kernel_thread(int (*fn)(void *), -+ void * arg, -+ unsigned long flags, -+ pid_t pid); -+ -+unsigned int test_cpu_caps(void); -+unsigned int test_kernel_config(void); -+ -+#define test_one_flag(src, dst, flag, message, ret) \ -+if (src & (1 << flag)) \ -+ if (!(dst & (1 << flag))) { \ -+ wprintk("Destination cpu does not have " message "\n"); \ -+ ret = 1; \ -+ } -+ -+static inline void -+_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) -+{ -+ while (nsec >= NSEC_PER_SEC) { -+ nsec -= NSEC_PER_SEC; -+ ++sec; -+ } -+ while (nsec < 0) { -+ nsec += NSEC_PER_SEC; -+ --sec; -+ } -+ ts->tv_sec = sec; -+ ts->tv_nsec = nsec; -+} -+ -+static inline struct timespec -+_ns_to_timespec(const nsec_t nsec) -+{ -+ struct timespec ts; -+ -+ if (!nsec) -+ return (struct timespec) {0, 0}; -+ -+ ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); -+ if (unlikely(nsec < 0)) -+ _set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); -+ -+ return ts; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_mm.c linux-2.6.16-026test009/kernel/cpt/cpt_mm.c ---- linux-2.6.16.orig/kernel/cpt/cpt_mm.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_mm.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,826 @@ -+/* -+ * -+ * 
kernel/cpt/cpt_mm.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/hugetlb.h> -+#include <linux/errno.h> -+#include <linux/ve.h> -+#include <linux/pagemap.h> -+#include <linux/rmap.h> -+#include <asm/ldt.h> -+#include <asm/mmu.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_kernel.h" -+#include "cpt_fsmagic.h" -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+#include "cpt_pagein.h" -+#endif -+#include "cpt_ubc.h" -+ -+static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, -+ cpt_context_t *ctx) -+{ -+ if (!list_empty(&aio_ctx->run_list)) { -+ /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */ -+ eprintk_ctx("run list is not empty, cannot suspend AIO\n"); -+ return -EBUSY; -+ } -+ -+ /* Wait for pending IOCBs. Linux AIO is mostly _fake_. -+ * It is actually synchronous, except for direct IO and -+ * some funny raw USB things, which cannot happen inside VE. -+ * However, we do this for future. -+ * -+ * Later note: in 2.6.16 we may allow O_DIRECT, so that -+ * it is not meaningless code. 
-+ */ -+ wait_for_all_aios(aio_ctx); -+ -+ if (!list_empty(&aio_ctx->run_list) || -+ !list_empty(&aio_ctx->active_reqs) || -+ aio_ctx->reqs_active) { -+ eprintk_ctx("were not able to suspend AIO\n"); -+ return -EBUSY; -+ } -+ -+ return 0; -+} -+ -+static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx) -+{ -+ struct vm_area_struct *vma; -+ -+ for (vma = mm->mmap; vma; vma = vma->vm_next) { -+ if (vma->vm_file) { -+ if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL) -+ return -ENOMEM; -+ } -+ } -+ if (cpt_add_ubc(mm->mm_ub, ctx) == NULL) -+ return -ENOMEM; -+ -+ if (mm->ioctx_list) { -+ struct kioctx *aio_ctx; -+ int err; -+ -+ for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) -+ if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0) -+ return err; -+ } -+ -+ return 0; -+} -+ -+int cpt_collect_mm(cpt_context_t * ctx) -+{ -+ cpt_object_t *obj; -+ int err; -+ int index; -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL) -+ return -ENOMEM; -+ } -+ -+ index = 1; -+ for_each_object(obj, CPT_OBJ_MM) { -+ struct mm_struct *mm = obj->o_obj; -+ if (obj->o_count != atomic_read(&mm->mm_users)) { -+ eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users)); -+ return -EBUSY; -+ } -+ cpt_obj_setindex(obj, index++, ctx); -+ -+ if ((err = collect_one_mm(mm, ctx)) != 0) -+ return err; -+ } -+ -+ return 0; -+} -+ -+static int zcnt, scnt, scnt0, ucnt; -+ -+/* Function where_is_anon_page() returns address of an anonymous page in mm -+ * of already dumped process. This happens f.e. after fork(). We do not use -+ * this right now, just keep statistics, it is difficult to restore such state, -+ * but the most direct use is to save space in dumped image. 
*/ -+ -+ -+static inline unsigned long -+vma_address0(struct page *page, struct vm_area_struct *vma) -+{ -+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); -+ unsigned long address; -+ -+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); -+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) -+ address |= 1; -+ return address; -+} -+ -+static int really_this_one(struct vm_area_struct *vma, unsigned long address, -+ struct page *page) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ pgd_t *pgd; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *pte; -+ spinlock_t *ptl; -+ int result; -+ -+ pgd = pgd_offset(mm, address); -+ if (unlikely(!pgd_present(*pgd))) -+ return 0; -+ -+ pud = pud_offset(pgd, address); -+ if (!pud_present(*pud)) -+ return 0; -+ -+ pmd = pmd_offset(pud, address); -+ if (unlikely(!pmd_present(*pmd))) -+ return 0; -+ -+ result = 0; -+ pte = pte_offset_map(pmd, address); -+ if (!pte_present(*pte)) { -+ pte_unmap(pte); -+ return 0; -+ } -+ -+ ptl = pte_lockptr(mm, pmd); -+ if (!spin_trylock(ptl)) { -+ pte_unmap(pte); -+ return 0; -+ } -+ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) -+ result = 1; -+ pte_unmap_unlock(pte, ptl); -+ return result; -+} -+ -+static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr, -+ struct page *page, cpt_context_t * ctx) -+{ -+ loff_t mmptr = CPT_NULL; -+ struct anon_vma *anon_vma; -+ struct vm_area_struct *vma; -+ int idx = mmobj->o_index; -+ -+ if (!PageAnon(page)) -+ return CPT_NULL; -+ -+ anon_vma = page_lock_anon_vma(page); -+ if (!anon_vma) -+ return CPT_NULL; -+ -+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { -+ unsigned long addr = vma_address0(page, vma); -+ cpt_object_t *obj; -+ -+ /* We do not try to support mremapped regions (addr != mapaddr), -+ * only mmaps directly inherited via fork(). 
-+ * With this limitation we may check self-consistency of -+ * vmas (vm_start, vm_pgoff, anon_vma) before -+ * doing __copy_page_range() in rst_mm. -+ */ -+ if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) { -+ obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx); -+ if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) { -+ if (really_this_one(vma, addr, page)) { -+ mmptr = obj->o_pos; -+ idx = obj->o_index; -+ } -+ } -+ } -+ } -+ spin_unlock(&anon_vma->lock); -+ -+ return mmptr; -+} -+ -+struct page_area -+{ -+ int type; -+ unsigned long start; -+ unsigned long end; -+ unsigned long pgoff; -+ loff_t mm; -+}; -+ -+struct page_desc -+{ -+ int type; -+ int index; -+ loff_t mm; -+ int shared; -+}; -+ -+enum { -+ PD_ABSENT, -+ PD_COPY, -+ PD_ZERO, -+ PD_CLONE, -+ PD_FUNKEY, -+ PD_LAZY -+}; -+ -+/* 0: page can be obtained from backstore, or still not mapped anonymous page, -+ or something else, which does not require copy. -+ 1: page requires copy -+ 2: page requires copy but its content is zero. Quite useless. -+ 3: wp page is shared after fork(). It is to be COWed when modified. -+ 4: page is something unsupported... We copy it right now. 
-+ */ -+ -+ -+ -+static void page_get_desc(cpt_object_t *mmobj, -+ struct vm_area_struct *vma, unsigned long addr, -+ struct page_desc *pdesc, cpt_context_t * ctx) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ pgd_t *pgd; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep, pte; -+ spinlock_t *ptl; -+ struct page *pg; -+ int linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff; -+ -+ pdesc->index = linear_index; -+ pdesc->shared = 0; -+ -+ if (vma->vm_flags & VM_IO) { -+ pdesc->type = PD_ABSENT; -+ return; -+ } -+ -+ pgd = pgd_offset(mm, addr); -+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) -+ goto out_absent; -+ pud = pud_offset(pgd, addr); -+ if (pud_none(*pud) || unlikely(pud_bad(*pud))) -+ goto out_absent; -+ pmd = pmd_offset(pud, addr); -+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) -+ goto out_absent; -+ if (pmd_huge(*pmd)) { -+ eprintk_ctx("page_huge\n"); -+ goto out_unsupported; -+ } -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!ptep) -+ goto out_absent; -+ -+ pte = *ptep; -+ if (pte_none(pte)) -+ goto out_absent_unmap; -+ -+ if (!pte_present(pte)) { -+ if (pte_file(pte)) { -+ pdesc->index = pte_to_pgoff(pte); -+ goto out_absent_unmap; -+ } -+ if (vma->vm_flags & VM_SHARED) { -+ /* It is impossible: shared mappings cannot be in swap */ -+ eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos); -+ goto out_unsupported_unmap; -+ } -+ /* Otherwise it is in swap. 
*/ -+ goto out_lazy_unmap; -+ } else if ((pg = vm_normal_page(vma, addr, pte)) != NULL) { -+ -+ if (pg->mapping && !PageAnon(pg)) { -+ if (vma->vm_file == NULL) { -+ eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr); -+ goto out_unsupported_unmap; -+ } -+ if (vma->vm_file->f_mapping != pg->mapping) { -+ eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n", addr, vma->vm_file->f_mapping, pg->mapping, mmobj->o_pos); -+ goto out_unsupported_unmap; -+ } -+ pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); -+ /* Page is in backstore. For us it is like -+ * it is not present. -+ */ -+ goto out_absent_unmap; -+ } -+ -+ if (PageReserved(pg)) { -+ /* Special case: ZERO_PAGE is used, when an -+ * anonymous page is accessed but not written. */ -+ if (pg == ZERO_PAGE(addr)) { -+ if (pte_write(pte)) { -+ eprintk_ctx("not funny already, writable ZERO_PAGE\n"); -+ goto out_unsupported_unmap; -+ } -+ zcnt++; -+ goto out_absent_unmap; -+ } -+ eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index, addr, mmobj->o_pos); -+ goto out_unsupported_unmap; -+ } -+ -+ if (pg == ZERO_PAGE(addr)) { -+ wprintk_ctx("that's how it works now\n"); -+ } -+ -+ if (!pg->mapping) { -+ eprintk_ctx("page without mapping at %08lx@%Ld\n", addr, mmobj->o_pos); -+ goto out_unsupported_unmap; -+ } -+ -+ if (pg->mapping && page_mapcount(pg) > 1) { -+ pdesc->shared = 1; -+ pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx); -+ if (pdesc->mm != CPT_NULL) { -+ scnt0++; -+ goto out_clone_unmap; -+ } else { -+ scnt++; -+ } -+ } -+ -+ if (!pte_young(pte)) -+ goto out_lazy_unmap; -+ } -+ pte_unmap_unlock(ptep, ptl); -+ pdesc->type = PD_COPY; -+ return; -+ -+out_lazy_unmap: -+ pte_unmap_unlock(ptep, ptl); -+ pdesc->type = PD_LAZY; -+ return; -+ -+out_absent_unmap: -+ pte_unmap_unlock(ptep, ptl); -+out_absent: -+ pdesc->type = PD_ABSENT; -+ return; -+ -+out_clone_unmap: -+ pte_unmap_unlock(ptep, ptl); -+ pdesc->type = PD_CLONE; -+ return; -+ -+out_unsupported_unmap: -+ 
pte_unmap_unlock(ptep, ptl); -+out_unsupported: -+ ucnt++; -+ pdesc->type = PD_FUNKEY; -+ return; -+} -+ -+/* ATTN: We give "current" to get_user_pages(). This is wrong, but get_user_pages() -+ * does not really need this thing. It just stores some page fault stats there. -+ * -+ * BUG: some archs (f.e. sparc64, but not Intel*) require flush cache pages -+ * before accessing vma. -+ */ -+void dump_pages(struct vm_area_struct *vma, unsigned long start, -+ unsigned long end, struct cpt_context *ctx) -+{ -+#define MAX_PAGE_BATCH 16 -+ struct page *pg[MAX_PAGE_BATCH]; -+ int npages = (end - start)/PAGE_SIZE; -+ int count = 0; -+ -+ while (count < npages) { -+ int copy = npages - count; -+ int n; -+ -+ if (copy > MAX_PAGE_BATCH) -+ copy = MAX_PAGE_BATCH; -+ n = get_user_pages(current, vma->vm_mm, start, copy, -+ 0, 1, pg, NULL); -+ if (n == copy) { -+ int i; -+ for (i=0; i<n; i++) { -+ char *maddr = kmap(pg[i]); -+ ctx->write(maddr, PAGE_SIZE, ctx); -+ kunmap(pg[i]); -+ } -+ } else { -+ eprintk_ctx("get_user_pages fault"); -+ for ( ; n > 0; n--) -+ page_cache_release(pg[n-1]); -+ return; -+ } -+ start += n*PAGE_SIZE; -+ count += n; -+ for ( ; n > 0; n--) -+ page_cache_release(pg[n-1]); -+ } -+ return; -+} -+ -+int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb, -+ int copy, -+ struct cpt_context *ctx) -+{ -+ loff_t saved_object; -+ -+ cpt_push_object(&saved_object, ctx); -+ -+ pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES; -+ pgb->cpt_hdrlen = sizeof(*pgb); -+ pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ? 
CPT_CONTENT_DATA : CPT_CONTENT_VOID; -+ -+ ctx->write(pgb, sizeof(*pgb), ctx); -+ if (copy == PD_COPY || copy == PD_LAZY) -+ dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_object, ctx); -+ return 0; -+} -+ -+int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa, -+ struct cpt_context *ctx) -+{ -+ struct cpt_remappage_block pgb; -+ loff_t saved_object; -+ -+ cpt_push_object(&saved_object, ctx); -+ -+ pgb.cpt_object = CPT_OBJ_REMAPPAGES; -+ pgb.cpt_hdrlen = sizeof(pgb); -+ pgb.cpt_content = CPT_CONTENT_VOID; -+ pgb.cpt_start = pa->start; -+ pgb.cpt_end = pa->end; -+ pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1; -+ -+ ctx->write(&pgb, sizeof(pgb), ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_object, ctx); -+ return 0; -+} -+ -+int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa, -+ struct cpt_context *ctx) -+{ -+ struct cpt_copypage_block pgb; -+ loff_t saved_object; -+ -+ cpt_push_object(&saved_object, ctx); -+ -+ pgb.cpt_object = CPT_OBJ_COPYPAGES; -+ pgb.cpt_hdrlen = sizeof(pgb); -+ pgb.cpt_content = CPT_CONTENT_VOID; -+ pgb.cpt_start = pa->start; -+ pgb.cpt_end = pa->end; -+ pgb.cpt_source = pa->mm; -+ -+ ctx->write(&pgb, sizeof(pgb), ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_object, ctx); -+ return 0; -+} -+ -+int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa, -+ cpt_context_t *ctx) -+{ -+ struct cpt_lazypage_block pgb; -+ loff_t saved_object; -+ -+ cpt_push_object(&saved_object, ctx); -+ -+ pgb.cpt_object = CPT_OBJ_LAZYPAGES; -+ pgb.cpt_hdrlen = sizeof(pgb); -+ pgb.cpt_content = CPT_CONTENT_VOID; -+ pgb.cpt_start = pa->start; -+ pgb.cpt_end = pa->end; -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start, -+ (pa->end-pa->start)/PAGE_SIZE, ctx); -+#endif -+ ctx->write(&pgb, sizeof(pgb), ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_object, ctx); -+ return 
0; -+} -+ -+static int can_expand(struct page_area *pa, struct page_desc *pd) -+{ -+ if (pa->start == pa->end) -+ return 1; -+ if (pa->type != pd->type) -+ return 0; -+ if (pa->type == PD_ABSENT) -+ return pd->index == pa->pgoff + 1; -+ if (pa->type == PD_CLONE) -+ return pd->mm == pa->mm; -+ return 1; -+} -+ -+static int dump_one_vma(cpt_object_t *mmobj, -+ struct vm_area_struct *vma, struct cpt_context *ctx) -+{ -+ struct cpt_vma_image *v = cpt_get_buf(ctx); -+ unsigned long addr; -+ loff_t saved_object; -+ struct cpt_page_block pgb; -+ struct page_area pa; -+ int cloned_pages = 0; -+ -+ cpt_push_object(&saved_object, ctx); -+ -+ v->cpt_object = CPT_OBJ_VMA; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ v->cpt_start = vma->vm_start; -+ v->cpt_end = vma->vm_end; -+ v->cpt_flags = vma->vm_flags; -+ if (vma->vm_flags&VM_HUGETLB) { -+ eprintk_ctx("huge TLB VMAs are still not supported\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ v->cpt_pgprot = vma->vm_page_prot.pgprot; -+ v->cpt_pgoff = vma->vm_pgoff; -+ v->cpt_file = CPT_NULL; -+ v->cpt_type = CPT_VMA_TYPE_0; -+ v->cpt_anonvma = 0; -+ -+ /* We have to remember what VMAs are bound to one anon_vma. -+ * So, we store an identifier of group of VMAs. It is handy -+ * to use absolute address of anon_vma as this identifier. 
*/ -+ v->cpt_anonvmaid = (unsigned long)vma->anon_vma; -+ -+ if (vma->vm_file) { -+ struct file *filp; -+ cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx); -+ if (obj == NULL) BUG(); -+ filp = obj->o_obj; -+ if (filp->f_op && -+ filp->f_op->read == NULL && -+ filp->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_TMPFS) -+ v->cpt_type = CPT_VMA_TYPE_SHM; -+ v->cpt_file = obj->o_pos; -+ } -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ pa.type = PD_ABSENT; -+ pa.pgoff = vma->vm_pgoff; -+ pa.mm = CPT_NULL; -+ pa.start = vma->vm_start; -+ pa.end = vma->vm_start; -+ -+ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { -+ struct page_desc pd; -+ -+ page_get_desc(mmobj, vma, addr, &pd, ctx); -+ cloned_pages += pd.shared; -+ -+ if (pd.type == PD_FUNKEY) { -+ eprintk_ctx("dump_one_vma: funkey page\n"); -+ return -EINVAL; -+ } -+ -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ if (pd.type == PD_LAZY && -+ (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED))) -+ pd.type = PD_COPY; -+#else -+ if (pd.type == PD_LAZY) -+ pd.type = PD_COPY; -+#endif -+ -+ if (!can_expand(&pa, &pd)) { -+ if (pa.type == PD_COPY || -+ pa.type == PD_ZERO) { -+ pgb.cpt_start = pa.start; -+ pgb.cpt_end = pa.end; -+ dump_page_block(vma, &pgb, pa.type, ctx); -+ } else if (pa.type == PD_CLONE) { -+ dump_copypage_block(vma, &pa, ctx); -+ cloned_pages++; -+ } else if (pa.type == PD_LAZY) { -+ dump_lazypage_block(vma, &pa, ctx); -+ } else if (pa.type == PD_ABSENT && -+ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { -+ dump_remappage_block(vma, &pa, ctx); -+ } -+ pa.start = addr; -+ } -+ pa.type = pd.type; -+ pa.end = addr + PAGE_SIZE; -+ pa.pgoff = pd.index; -+ pa.mm = pd.mm; -+ } -+ -+ if (pa.end > pa.start) { -+ if (pa.type == PD_COPY || -+ pa.type == PD_ZERO) { -+ pgb.cpt_start = pa.start; -+ pgb.cpt_end = pa.end; -+ dump_page_block(vma, &pgb, pa.type, ctx); -+ } else if (pa.type == PD_CLONE) { -+ dump_copypage_block(vma, &pa, ctx); -+ 
cloned_pages++; -+ } else if (pa.type == PD_LAZY) { -+ dump_lazypage_block(vma, &pa, ctx); -+ } else if (pa.type == PD_ABSENT && -+ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { -+ dump_remappage_block(vma, &pa, ctx); -+ } -+ } -+ -+ if (cloned_pages) { -+ __u32 anonvma = 1; -+ loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma); -+ ctx->pwrite(&anonvma, 4, ctx, anonpos); -+ } -+ -+ cpt_close_object(ctx); -+ -+ cpt_pop_object(&saved_object, ctx); -+ -+ return 0; -+} -+ -+static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, -+ cpt_context_t *ctx) -+{ -+ loff_t saved_object; -+ struct cpt_aio_ctx_image aimg; -+ -+ if (!list_empty(&aio_ctx->run_list) || -+ !list_empty(&aio_ctx->active_reqs) || -+ aio_ctx->reqs_active) { -+ eprintk_ctx("AIO is active after suspend\n"); -+ return -EBUSY; -+ } -+ -+ cpt_push_object(&saved_object, ctx); -+ -+ aimg.cpt_next = CPT_ALIGN(sizeof(aimg)); -+ aimg.cpt_object = CPT_OBJ_AIO_CONTEXT; -+ aimg.cpt_hdrlen = sizeof(aimg); -+ aimg.cpt_content = CPT_CONTENT_ARRAY; -+ -+ aimg.cpt_max_reqs = aio_ctx->max_reqs; -+ aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages; -+ aimg.cpt_nr = aio_ctx->ring_info.nr; -+ aimg.cpt_tail = aio_ctx->ring_info.tail; -+ aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base; -+ -+ ctx->write(&aimg, sizeof(aimg), ctx); -+ -+ cpt_pop_object(&saved_object, ctx); -+ return 0; -+} -+ -+static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct mm_struct *mm = obj->o_obj; -+ struct vm_area_struct *vma; -+ struct cpt_mm_image *v = cpt_get_buf(ctx); -+ -+ cpt_open_object(obj, ctx); -+ -+ v->cpt_next = -1; -+ v->cpt_object = CPT_OBJ_MM; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ v->cpt_start_code = mm->start_code; -+ v->cpt_end_code = mm->end_code; -+ v->cpt_start_data = mm->start_data; -+ v->cpt_end_data = mm->end_data; -+ v->cpt_start_brk = mm->start_brk; -+ v->cpt_brk = mm->brk; -+ 
v->cpt_start_stack = mm->start_stack; -+ v->cpt_start_arg = mm->arg_start; -+ v->cpt_end_arg = mm->arg_end; -+ v->cpt_start_env = mm->env_start; -+ v->cpt_end_env = mm->env_end; -+ v->cpt_def_flags = mm->def_flags; -+ v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx); -+ v->cpt_dumpable = mm->dumpable; -+ v->cpt_vps_dumpable = mm->vps_dumpable; -+ v->cpt_used_hugetlb = 0; -+#ifdef CONFIG_HUGETLB_PAGE -+ v->cpt_used_hugetlb = mm->used_hugetlb; -+#endif -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ if (mm->context.size) { -+ loff_t saved_object; -+ struct cpt_obj_bits b; -+ int size; -+ -+ dprintk_ctx("nontrivial LDT\n"); -+ -+ cpt_push_object(&saved_object, ctx); -+ -+ cpt_open_object(NULL, ctx); -+ b.cpt_next = CPT_NULL; -+ b.cpt_object = CPT_OBJ_BITS; -+ b.cpt_hdrlen = sizeof(b); -+ b.cpt_content = CPT_CONTENT_MM_CONTEXT; -+ b.cpt_size = mm->context.size*LDT_ENTRY_SIZE; -+ -+ ctx->write(&b, sizeof(b), ctx); -+ -+ size = mm->context.size*LDT_ENTRY_SIZE; -+ -+#if defined(CONFIG_X86_64) || LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15) -+ ctx->write(mm->context.ldt, size, ctx); -+#else -+ for (i = 0; i < size; i += PAGE_SIZE) { -+ int nr = i / PAGE_SIZE, bytes; -+ char *kaddr = kmap(mm->context.ldt_pages[nr]); -+ -+ bytes = size - i; -+ if (bytes > PAGE_SIZE) -+ bytes = PAGE_SIZE; -+ ctx->write(kaddr, bytes, ctx); -+ kunmap(mm->context.ldt_pages[nr]); -+ } -+#endif -+ -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_object, ctx); -+ } -+ -+ for (vma = mm->mmap; vma; vma = vma->vm_next) { -+ int err; -+ -+#ifdef CONFIG_X86_64 -+ if (vma->vm_start == 0xFFFFE000 && -+ vma->vm_end == 0xFFFFF000) -+ continue; -+#endif -+ -+ if ((err = dump_one_vma(obj, vma, ctx)) != 0) -+ return err; -+ } -+ -+ if (mm->ioctx_list) { -+ struct kioctx *aio_ctx; -+ int err; -+ -+ for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) -+ if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0) -+ return err; -+ } -+ -+ cpt_close_object(ctx); -+ -+ return 0; -+} -+ 
-+int cpt_dump_vm(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ scnt = scnt0 = zcnt = 0; -+ -+ cpt_open_section(ctx, CPT_SECT_MM); -+ -+ for_each_object(obj, CPT_OBJ_MM) { -+ int err; -+ -+ if ((err = dump_one_mm(obj, ctx)) != 0) -+ return err; -+ } -+ -+ cpt_close_section(ctx); -+ -+ if (scnt) -+ dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt); -+ if (scnt0) -+ dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0); -+ if (zcnt) -+ dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt); -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_mm.h linux-2.6.16-026test009/kernel/cpt/cpt_mm.h ---- linux-2.6.16.orig/kernel/cpt/cpt_mm.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_mm.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,16 @@ -+int cpt_collect_mm(cpt_context_t *); -+ -+int cpt_dump_vm(struct cpt_context *ctx); -+ -+__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx); -+int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx); -+int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx); -+ -+int cpt_mm_prepare(unsigned long veid); -+ -+int cpt_free_pgin_dir(struct cpt_context *); -+int cpt_start_pagein(struct cpt_context *); -+int rst_setup_pagein(struct cpt_context *); -+int rst_complete_pagein(struct cpt_context *, int); -+int rst_pageind(struct cpt_context *); -+int rst_swapoff(struct cpt_context *); -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_net.c linux-2.6.16-026test009/kernel/cpt/cpt_net.c ---- linux-2.6.16.orig/kernel/cpt/cpt_net.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_net.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,363 @@ -+/* -+ * -+ * kernel/cpt/cpt_net.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/fs.h> -+#include <linux/socket.h> -+#include <linux/netdevice.h> -+#include <linux/inetdevice.h> -+#include <linux/rtnetlink.h> -+#include <linux/ve.h> -+#include <linux/ve_proto.h> -+#include <linux/vzcalluser.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_kernel.h" -+#include "cpt_syscalls.h" -+ -+int cpt_dump_link(struct cpt_context * ctx) -+{ -+ struct net_device *dev; -+ -+ cpt_open_section(ctx, CPT_SECT_NET_DEVICE); -+ for (dev = dev_base; dev; dev = dev->next) { -+ struct cpt_netdev_image v; -+ -+ cpt_open_object(NULL, ctx); -+ -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_NET_DEVICE; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_VOID; -+ -+ v.cpt_index = dev->ifindex; -+ v.cpt_flags = dev->flags; -+ memcpy(v.cpt_name, dev->name, IFNAMSIZ); -+ ctx->write(&v, sizeof(v), ctx); -+ cpt_close_object(ctx); -+ -+ if (strcmp(dev->name, "lo") != 0 && -+ strcmp(dev->name, "venet0") != 0) { -+ eprintk_ctx("unsupported netdevice %s\n", dev->name); -+ cpt_close_section(ctx); -+ return -EBUSY; -+ } -+ } -+ cpt_close_section(ctx); -+ return 0; -+} -+ -+int cpt_suspend_network(struct cpt_context *ctx) -+{ -+ get_exec_env()->disable_net = 1; -+ synchronize_net(); -+ return 0; -+} -+ -+int cpt_resume_network(struct cpt_context *ctx) -+{ -+ struct ve_struct *env; -+ env = get_ve_by_id(ctx->ve_id); -+ if (!env) -+ return -ESRCH; -+ env->disable_net = 0; -+ put_ve(env); -+ return 0; -+} -+ -+int cpt_dump_ifaddr(struct cpt_context * ctx) -+{ -+ struct net_device *dev; -+ -+ cpt_open_section(ctx, CPT_SECT_NET_IFADDR); -+ for (dev = dev_base; dev; dev = dev->next) { -+ struct in_device *idev = in_dev_get(dev); 
-+ struct in_ifaddr *ifa; -+ -+ if (!idev) -+ continue; -+ -+ for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) { -+ struct cpt_ifaddr_image v; -+ cpt_open_object(NULL, ctx); -+ -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_NET_IFADDR; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_VOID; -+ -+ v.cpt_index = dev->ifindex; -+ v.cpt_family = AF_INET; -+ v.cpt_masklen = ifa->ifa_prefixlen; -+ v.cpt_flags = ifa->ifa_flags; -+ v.cpt_scope = ifa->ifa_scope; -+ memset(&v.cpt_address, 0, sizeof(v.cpt_address)); -+ memset(&v.cpt_peer, 0, sizeof(v.cpt_peer)); -+ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); -+ v.cpt_address[0] = ifa->ifa_local; -+ v.cpt_peer[0] = ifa->ifa_address; -+ v.cpt_broadcast[0] = ifa->ifa_broadcast; -+ memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ); -+ ctx->write(&v, sizeof(v), ctx); -+ cpt_close_object(ctx); -+ } -+ in_dev_put(idev); -+ } -+ cpt_close_section(ctx); -+ return 0; -+} -+ -+static int cpt_dump_route(struct cpt_context * ctx) -+{ -+ int err; -+ struct socket *sock; -+ struct msghdr msg; -+ struct iovec iov; -+ struct { -+ struct nlmsghdr nlh; -+ struct rtgenmsg g; -+ } req; -+ struct sockaddr_nl nladdr; -+ struct cpt_object_hdr v; -+ mm_segment_t oldfs; -+ char *pg; -+ -+ err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); -+ if (err) -+ return err; -+ -+ memset(&nladdr, 0, sizeof(nladdr)); -+ nladdr.nl_family = AF_NETLINK; -+ -+ req.nlh.nlmsg_len = sizeof(req); -+ req.nlh.nlmsg_type = RTM_GETROUTE; -+ req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; -+ req.nlh.nlmsg_pid = 0; -+ req.g.rtgen_family = AF_INET; -+ -+ iov.iov_base=&req; -+ iov.iov_len=sizeof(req); -+ msg.msg_name=&nladdr; -+ msg.msg_namelen=sizeof(nladdr); -+ msg.msg_iov=&iov; -+ msg.msg_iovlen=1; -+ msg.msg_control=NULL; -+ msg.msg_controllen=0; -+ msg.msg_flags=MSG_DONTWAIT; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ err = sock_sendmsg(sock, &msg, sizeof(req)); -+ set_fs(oldfs); -+ -+ if (err < 0) -+ goto 
out_sock; -+ -+ pg = (char*)__get_free_page(GFP_KERNEL); -+ if (pg == NULL) { -+ err = -ENOMEM; -+ goto out_sock; -+ } -+ -+ cpt_open_section(ctx, CPT_SECT_NET_ROUTE); -+ cpt_open_object(NULL, ctx); -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_NET_ROUTE; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_NLMARRAY; -+ -+ ctx->write(&v, sizeof(v), ctx); -+ -+ -+ for (;;) { -+ struct nlmsghdr *h; -+ -+ iov.iov_base = pg; -+ iov.iov_len = PAGE_SIZE; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); -+ set_fs(oldfs); -+ -+ if (err < 0) -+ goto out_sock_pg; -+ if (msg.msg_flags & MSG_TRUNC) { -+ err = -ENOBUFS; -+ goto out_sock_pg; -+ } -+ -+ h = (struct nlmsghdr*)pg; -+ while (NLMSG_OK(h, err)) { -+ if (h->nlmsg_type == NLMSG_DONE) { -+ err = 0; -+ goto done; -+ } -+ if (h->nlmsg_type == NLMSG_ERROR) { -+ struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h); -+ err = errm->error; -+ eprintk_ctx("NLMSG error: %d\n", errm->error); -+ goto done; -+ } -+ if (h->nlmsg_type != RTM_NEWROUTE) { -+ eprintk_ctx("NLMSG: %d\n", h->nlmsg_type); -+ err = -EINVAL; -+ goto done; -+ } -+ ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx); -+ h = NLMSG_NEXT(h, err); -+ } -+ if (err) { -+ eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type); -+ err = -EINVAL; -+ break; -+ } -+ } -+done: -+ cpt_close_object(ctx); -+ cpt_close_section(ctx); -+ -+out_sock_pg: -+ free_page((unsigned long)pg); -+out_sock: -+ sock_release(sock); -+ return err; -+} -+ -+static int dumpfn(void *arg) -+{ -+ int i; -+ int *pfd = arg; -+ char *argv[] = { "iptables-save", "-c", NULL }; -+ -+ i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); -+ if (i < 0) { -+ eprintk("cannot enter ve to dump iptables\n"); -+ module_put(THIS_MODULE); -+ return 1; -+ } -+ -+ if (pfd[1] != 1) -+ sc_dup2(pfd[1], 1); -+ -+ for (i=0; i<current->files->fdt->max_fds; i++) { -+ if (i != 1) -+ sc_close(i); -+ } -+ -+ 
module_put(THIS_MODULE); -+ -+ set_fs(KERNEL_DS); -+ i = sc_execve("/sbin/iptables-save", argv, NULL); -+ eprintk("failed to exec /sbin/iptables-save: %d\n", i); -+ return -1; -+} -+ -+ -+static int cpt_dump_iptables(struct cpt_context * ctx) -+{ -+ int err; -+ int pid; -+ int pfd[2]; -+ struct file *f; -+ struct cpt_object_hdr v; -+ char buf[16]; -+ loff_t pos; -+ int n; -+ -+ err = sc_pipe(pfd); -+ if (err < 0) { -+ eprintk_ctx("sc_pipe: %d\n", err); -+ return err; -+ } -+ err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); -+ if (err < 0) { -+ eprintk_ctx("local_kernel_thread: %d\n", err); -+ goto out; -+ } -+ f = fget(pfd[0]); -+ sc_close(pfd[1]); -+ sc_close(pfd[0]); -+ -+ cpt_open_section(ctx, CPT_SECT_NET_IPTABLES); -+ -+ cpt_open_object(NULL, ctx); -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_NAME; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_NAME; -+ -+ ctx->write(&v, sizeof(v), ctx); -+ -+ pos = ctx->file->f_pos; -+ do { -+ mm_segment_t oldfs; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); -+ set_fs(oldfs); -+ if (n > 0) -+ ctx->write(buf, n, ctx); -+ } while (n > 0); -+ -+ if (n < 0) -+ eprintk_ctx("read: %d\n", n); -+ -+ fput(f); -+ -+ if ((err = sc_waitx(pid, 0)) < 0) -+ eprintk_ctx("wait4: %d\n", err); -+ -+ if (ctx->file->f_pos != pos) { -+ buf[0] = 0; -+ ctx->write(buf, 1, ctx); -+ ctx->align(ctx); -+ cpt_close_object(ctx); -+ cpt_close_section(ctx); -+ } else { -+ pos = ctx->current_section; -+ cpt_close_object(ctx); -+ cpt_close_section(ctx); -+ ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL; -+ ctx->file->f_pos = pos; -+ } -+ return n; -+ -+out: -+ if (pfd[1] >= 0) -+ sc_close(pfd[1]); -+ if (pfd[0] >= 0) -+ sc_close(pfd[0]); -+ return err; -+} -+ -+int cpt_dump_ifinfo(struct cpt_context * ctx) -+{ -+ int err; -+ -+ err = cpt_dump_link(ctx); -+ if (!err) -+ err = cpt_dump_ifaddr(ctx); -+ if (!err) -+ err = cpt_dump_route(ctx); -+ if (!err) -+ err = 
cpt_dump_iptables(ctx); -+ return err; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_net.h linux-2.6.16-026test009/kernel/cpt/cpt_net.h ---- linux-2.6.16.orig/kernel/cpt/cpt_net.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_net.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,7 @@ -+int cpt_dump_ifinfo(struct cpt_context *ctx); -+int rst_restore_net(struct cpt_context *ctx); -+int cpt_suspend_network(struct cpt_context *ctx); -+int cpt_resume_network(struct cpt_context *ctx); -+int rst_resume_network(struct cpt_context *ctx); -+int cpt_dump_ip_conntrack(struct cpt_context *ctx); -+int rst_restore_ip_conntrack(struct cpt_context * ctx); -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_obj.c linux-2.6.16-026test009/kernel/cpt/cpt_obj.c ---- linux-2.6.16.orig/kernel/cpt/cpt_obj.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_obj.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,172 @@ -+/* -+ * -+ * kernel/cpt/cpt_obj.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ obj = kmalloc(sizeof(cpt_object_t), gfp); -+ if (obj) { -+ INIT_LIST_HEAD(&obj->o_list); -+ INIT_LIST_HEAD(&obj->o_hash); -+ INIT_LIST_HEAD(&obj->o_alist); -+ obj->o_count = 1; -+ obj->o_pos = CPT_NULL; -+ obj->o_lock = 0; -+ obj->o_parent = NULL; -+ obj->o_index = CPT_NOINDEX; -+ obj->o_obj = NULL; -+ obj->o_image = NULL; -+ ctx->objcount++; -+ } -+ return obj; -+} -+// //EXPORT_SYMBOL(alloc_cpt_object); -+ -+void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx) -+{ -+ list_del(&obj->o_alist); -+ kfree(obj); -+ ctx->objcount--; -+} -+ -+void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx) -+{ -+ list_add_tail(&obj->o_list, &ctx->object_array[type]); -+} -+// //EXPORT_SYMBOL(intern_cpt_object); -+ -+void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, -+ cpt_object_t *head, cpt_context_t *ctx) -+{ -+ list_add(&obj->o_list, &head->o_list); -+} -+// //EXPORT_SYMBOL(insert_cpt_object); -+ -+cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p, -+ unsigned gfp_mask, cpt_context_t *ctx) -+{ -+ cpt_object_t *obj; -+ -+ obj = lookup_cpt_object(type, p, ctx); -+ -+ if (obj) { -+ obj->o_count++; -+ return obj; -+ } -+ -+ if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) { -+ if (p) -+ cpt_obj_setobj(obj, p, ctx); -+ intern_cpt_object(type, obj, ctx); -+ return obj; -+ } -+ return NULL; -+} -+// //EXPORT_SYMBOL(__cpt_object_add); -+ -+cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx) -+{ -+ return __cpt_object_add(type, p, 
GFP_KERNEL, ctx); -+} -+// //EXPORT_SYMBOL(cpt_object_add); -+ -+cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx) -+{ -+ cpt_object_t *obj; -+ -+ obj = lookup_cpt_object(type, p, ctx); -+ -+ if (obj) -+ obj->o_count++; -+ -+ return obj; -+} -+// //EXPORT_SYMBOL(cpt_object_get); -+ -+int cpt_object_init(cpt_context_t *ctx) -+{ -+ int i; -+ -+ for (i=0; i<CPT_OBJ_MAX; i++) { -+ INIT_LIST_HEAD(&ctx->object_array[i]); -+ } -+ return 0; -+} -+ -+int cpt_object_destroy(cpt_context_t *ctx) -+{ -+ int i; -+ -+ for (i=0; i<CPT_OBJ_MAX; i++) { -+ while (!list_empty(&ctx->object_array[i])) { -+ struct list_head *head = ctx->object_array[i].next; -+ cpt_object_t *obj = list_entry(head, cpt_object_t, o_list); -+ list_del(head); -+ if (obj->o_image) -+ kfree(obj->o_image); -+ free_cpt_object(obj, ctx); -+ } -+ } -+ if (ctx->objcount != 0) -+ eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount); -+ return 0; -+} -+ -+cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, type) { -+ if (obj->o_obj == p) -+ return obj; -+ } -+ return NULL; -+} -+// //EXPORT_SYMBOL(lookup_cpt_object); -+ -+cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, type) { -+ if (obj->o_pos == pos) -+ return obj; -+ } -+ return NULL; -+} -+// //EXPORT_SYMBOL(lookup_cpt_obj_bypos); -+ -+cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, type) { -+ if (obj->o_index == index) -+ return obj; -+ } -+ return NULL; -+} -+// //EXPORT_SYMBOL(lookup_cpt_obj_byindex); -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_obj.h linux-2.6.16-026test009/kernel/cpt/cpt_obj.h ---- linux-2.6.16.orig/kernel/cpt/cpt_obj.h 2006-04-19 15:02:49.000000000 +0400 -+++ 
/*
 * Descriptor of one kernel object tracked by a checkpoint/restore context.
 */
typedef struct _cpt_object
{
	struct list_head	o_list;		/* link in ctx->object_array[type]; walked by for_each_object() */
	struct list_head	o_hash;		/* initialised empty on allocation; presumably reserved for a hash table -- TODO confirm */
	int			o_count;	/* starts at 1; bumped when an add/get finds the object already registered */
	int			o_index;	/* CPT_NOINDEX until cpt_obj_setindex(); key for lookup_cpt_obj_byindex() */
	int			o_lock;		/* set to 0 on allocation */
	loff_t			o_pos;		/* CPT_NULL until cpt_obj_setpos(); key for lookup_cpt_obj_bypos() */
	loff_t			o_ppos;		/* NOTE(review): looks like a second image position; usage not visible here */
	void			*o_obj;		/* the tracked kernel object; key for lookup_cpt_object() */
	void			*o_image;	/* optional buffer, kfree()d by cpt_object_destroy() */
	void			*o_parent;	/* set to NULL on allocation */
	struct list_head	o_alist;	/* unlinked by free_cpt_object() before the descriptor is freed */
} cpt_object_t;
*ctx); -+extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx); -+ -+extern int cpt_object_init(struct cpt_context *ctx); -+extern int cpt_object_destroy(struct cpt_context *ctx); -+ -+#endif /* __CPT_OBJ_H_ */ -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_proc.c linux-2.6.16-026test009/kernel/cpt/cpt_proc.c ---- linux-2.6.16.orig/kernel/cpt/cpt_proc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_proc.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,577 @@ -+/* -+ * -+ * kernel/cpt/cpt_proc.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/errno.h> -+#include <linux/mm.h> -+#include <linux/list.h> -+#include <linux/proc_fs.h> -+#include <linux/smp_lock.h> -+#include <asm/uaccess.h> -+#include <linux/cpt_ioctl.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_dump.h" -+#include "cpt_mm.h" -+#include "cpt_kernel.h" -+ -+MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>"); -+MODULE_LICENSE("GPL"); -+ -+/* List of contexts and lock protecting the list */ -+struct list_head cpt_context_list; -+spinlock_t cpt_context_lock; -+ -+static int proc_read(char *buffer, char **start, off_t offset, -+ int length, int *eof, void *data) -+{ -+ off_t pos = 0; -+ off_t begin = 0; -+ int len = 0; -+ cpt_context_t *ctx; -+ -+ len += sprintf(buffer, "Ctx Id VE State\n"); -+ -+ spin_lock(&cpt_context_lock); -+ -+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { -+ len += sprintf(buffer+len,"%p %08x %-8u %d", -+ ctx, -+ ctx->contextid, -+ ctx->ve_id, -+ ctx->ctx_state -+ ); -+ -+ buffer[len++] = '\n'; -+ -+ pos = begin+len; -+ if (pos < offset) { -+ len = 0; -+ 
begin = pos; -+ } -+ if (pos > offset+length) -+ goto done; -+ } -+ *eof = 1; -+ -+done: -+ spin_unlock(&cpt_context_lock); -+ *start = buffer + (offset - begin); -+ len -= (offset - begin); -+ if(len > length) -+ len = length; -+ if(len < 0) -+ len = 0; -+ return len; -+} -+ -+void cpt_context_release(cpt_context_t *ctx) -+{ -+ list_del(&ctx->ctx_list); -+ spin_unlock(&cpt_context_lock); -+ -+ if (ctx->ctx_state > 0) -+ cpt_resume(ctx); -+ ctx->ctx_state = CPT_CTX_ERROR; -+ -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ if (ctx->pgin_task) -+ put_task_struct(ctx->pgin_task); -+ if (ctx->pgin_dir) -+ cpt_free_pgin_dir(ctx); -+ if (ctx->pagein_file_out) -+ fput(ctx->pagein_file_out); -+ if (ctx->pagein_file_in) -+ fput(ctx->pagein_file_in); -+#endif -+ if (ctx->objcount) -+ eprintk_ctx("%d objects leaked\n", ctx->objcount); -+ if (ctx->file) -+ fput(ctx->file); -+ cpt_flush_error(ctx); -+ if (ctx->errorfile) { -+ fput(ctx->errorfile); -+ ctx->errorfile = NULL; -+ } -+ if (ctx->error_msg) { -+ free_page((unsigned long)ctx->error_msg); -+ ctx->error_msg = NULL; -+ } -+ if (ctx->statusfile) -+ fput(ctx->statusfile); -+ if (ctx->lockfile) -+ fput(ctx->lockfile); -+ kfree(ctx); -+ -+ spin_lock(&cpt_context_lock); -+} -+ -+static void __cpt_context_put(cpt_context_t *ctx) -+{ -+ if (!--ctx->refcount) -+ cpt_context_release(ctx); -+} -+ -+static void cpt_context_put(cpt_context_t *ctx) -+{ -+ spin_lock(&cpt_context_lock); -+ __cpt_context_put(ctx); -+ spin_unlock(&cpt_context_lock); -+} -+ -+cpt_context_t * cpt_context_open(void) -+{ -+ cpt_context_t *ctx; -+ -+ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { -+ cpt_context_init(ctx); -+ spin_lock(&cpt_context_lock); -+ list_add_tail(&ctx->ctx_list, &cpt_context_list); -+ spin_unlock(&cpt_context_lock); -+ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); -+ if (ctx->error_msg != NULL) -+ ctx->error_msg[0] = 0; -+ } -+ return ctx; -+} -+ -+cpt_context_t * cpt_context_lookup(unsigned int contextid) -+{ -+ cpt_context_t 
*ctx; -+ -+ spin_lock(&cpt_context_lock); -+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { -+ if (ctx->contextid == contextid) { -+ ctx->refcount++; -+ spin_unlock(&cpt_context_lock); -+ return ctx; -+ } -+ } -+ spin_unlock(&cpt_context_lock); -+ return NULL; -+} -+ -+int cpt_context_lookup_veid(unsigned int veid) -+{ -+ cpt_context_t *ctx; -+ -+ spin_lock(&cpt_context_lock); -+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { -+ if (ctx->ve_id == veid && ctx->ctx_state > 0) { -+ spin_unlock(&cpt_context_lock); -+ return 1; -+ } -+ } -+ spin_unlock(&cpt_context_lock); -+ return 0; -+} -+ -+static int cpt_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) -+{ -+ int err = 0; -+ cpt_context_t *ctx; -+ struct file *dfile = NULL; -+ -+ unlock_kernel(); -+ -+ if (cmd == CPT_VMPREP) { -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ err = cpt_mm_prepare(arg); -+#else -+ err = -EINVAL; -+#endif -+ goto out_lock; -+ } -+ -+ if (cmd == CPT_TEST_CAPS) { -+ unsigned int src_flags, dst_flags = arg; -+ -+ err = 0; -+ src_flags = test_cpu_caps(); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); -+ goto out_lock; -+ } -+ -+ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { -+ cpt_context_t *old_ctx; -+ -+ ctx = NULL; -+ if (cmd == CPT_JOIN_CONTEXT) { -+ err = -ENOENT; -+ ctx = cpt_context_lookup(arg); -+ if (!ctx) -+ goto out_lock; -+ } -+ -+ spin_lock(&cpt_context_lock); -+ old_ctx = 
(cpt_context_t*)file->private_data; -+ file->private_data = ctx; -+ -+ if (old_ctx) { -+ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { -+ old_ctx->sticky = 0; -+ old_ctx->refcount--; -+ } -+ __cpt_context_put(old_ctx); -+ } -+ spin_unlock(&cpt_context_lock); -+ err = 0; -+ goto out_lock; -+ } -+ -+ spin_lock(&cpt_context_lock); -+ ctx = (cpt_context_t*)file->private_data; -+ if (ctx) -+ ctx->refcount++; -+ spin_unlock(&cpt_context_lock); -+ -+ if (!ctx) { -+ cpt_context_t *old_ctx; -+ -+ err = -ENOMEM; -+ ctx = cpt_context_open(); -+ if (!ctx) -+ goto out_lock; -+ -+ spin_lock(&cpt_context_lock); -+ old_ctx = (cpt_context_t*)file->private_data; -+ if (!old_ctx) { -+ ctx->refcount++; -+ file->private_data = ctx; -+ } else { -+ old_ctx->refcount++; -+ } -+ if (old_ctx) { -+ __cpt_context_put(ctx); -+ ctx = old_ctx; -+ } -+ spin_unlock(&cpt_context_lock); -+ } -+ -+ if (cmd == CPT_GET_CONTEXT) { -+ unsigned int contextid = (unsigned int)arg; -+ -+ if (ctx->contextid && ctx->contextid != contextid) { -+ err = -EINVAL; -+ goto out_nosem; -+ } -+ if (!ctx->contextid) { -+ cpt_context_t *c1 = cpt_context_lookup(contextid); -+ if (c1) { -+ cpt_context_put(c1); -+ err = -EEXIST; -+ goto out_nosem; -+ } -+ ctx->contextid = contextid; -+ } -+ spin_lock(&cpt_context_lock); -+ if (!ctx->sticky) { -+ ctx->sticky = 1; -+ ctx->refcount++; -+ } -+ spin_unlock(&cpt_context_lock); -+ goto out_nosem; -+ } -+ -+ down(&ctx->main_sem); -+ -+ err = -EBUSY; -+ if (ctx->ctx_state < 0) -+ goto out; -+ -+ err = 0; -+ switch (cmd) { -+ case CPT_SET_DUMPFD: -+ if (ctx->ctx_state == CPT_CTX_DUMPING) { -+ err = -EBUSY; -+ break; -+ } -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ if (dfile->f_op == NULL || -+ dfile->f_op->write == NULL) { -+ fput(dfile); -+ err = -EBADF; -+ break; -+ } -+ } -+ if (ctx->file) -+ fput(ctx->file); -+ ctx->file = dfile; -+ break; -+ case CPT_SET_ERRORFD: -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if 
(IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ } -+ if (ctx->errorfile) -+ fput(ctx->errorfile); -+ ctx->errorfile = dfile; -+ break; -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ case CPT_SET_PAGEINFDIN: -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ } -+ if (ctx->pagein_file_in) -+ fput(ctx->pagein_file_in); -+ ctx->pagein_file_in = dfile; -+ break; -+ case CPT_SET_PAGEINFDOUT: -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ } -+ if (ctx->pagein_file_out) -+ fput(ctx->pagein_file_out); -+ ctx->pagein_file_out = dfile; -+ break; -+ case CPT_SET_LAZY: -+ ctx->lazy_vm = arg; -+ break; -+ case CPT_PAGEIND: -+ err = cpt_start_pagein(ctx); -+ break; -+#endif -+ case CPT_SET_VEID: -+ if (ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ ctx->ve_id = arg; -+ break; -+ case CPT_SET_CPU_FLAGS: -+ if (ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ ctx->dst_cpu_flags = arg; -+ ctx->src_cpu_flags = test_cpu_caps(); -+ break; -+ case CPT_SUSPEND: -+ if (cpt_context_lookup_veid(ctx->ve_id) || -+ ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ ctx->ctx_state = CPT_CTX_SUSPENDING; -+ err = cpt_vps_suspend(ctx); -+ if (err) { -+ if (cpt_resume(ctx) == 0) -+ ctx->ctx_state = CPT_CTX_IDLE; -+ } else { -+ ctx->ctx_state = CPT_CTX_SUSPENDED; -+ } -+ break; -+ case CPT_DUMP: -+ if (!ctx->ctx_state) { -+ err = -ENOENT; -+ break; -+ } -+ err = cpt_dump(ctx); -+ break; -+ case CPT_RESUME: -+ if (ctx->ctx_state == CPT_CTX_IDLE) { -+ err = -ENOENT; -+ break; -+ } -+ err = cpt_resume(ctx); -+ if (!err) -+ ctx->ctx_state = CPT_CTX_IDLE; -+ break; -+ case CPT_KILL: -+ if (ctx->ctx_state == CPT_CTX_IDLE) { -+ err = -ENOENT; -+ break; -+ } -+ err = cpt_kill(ctx); -+ if (!err) -+ ctx->ctx_state = CPT_CTX_IDLE; -+ break; -+ case CPT_TEST_VECAPS: -+ { -+ __u32 dst_flags = arg; -+ __u32 src_flags; -+ -+ err = cpt_vps_caps(ctx, &src_flags); -+ if 
(err) -+ break; -+ -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err); -+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err); -+ break; -+ } -+ default: -+ err = -EINVAL; -+ break; -+ } -+ -+out: -+ cpt_flush_error(ctx); -+ up(&ctx->main_sem); -+out_nosem: -+ cpt_context_put(ctx); -+out_lock: -+ lock_kernel(); -+ return err; -+} -+ -+static int cpt_open(struct inode *inode, struct file *file) -+{ -+ if (!try_module_get(THIS_MODULE)) -+ return -EBUSY; -+ -+ return 0; -+} -+ -+static int cpt_release(struct inode * inode, struct file * file) -+{ -+ cpt_context_t *ctx; -+ -+ spin_lock(&cpt_context_lock); -+ ctx = (cpt_context_t*)file->private_data; -+ file->private_data = NULL; -+ -+ if (ctx) -+ __cpt_context_put(ctx); -+ spin_unlock(&cpt_context_lock); -+ -+ module_put(THIS_MODULE); -+ return 0; -+} -+ -+ -+static struct file_operations cpt_fops = { -+ .owner = THIS_MODULE, -+ .open = cpt_open, -+ .release = cpt_release, -+ .ioctl = cpt_ioctl, -+}; -+ -+static struct proc_dir_entry *proc_ent; -+ -+int debug_level = 1; -+ -+static struct ctl_table_header *ctl_header; -+ -+static ctl_table debug_table[] = { -+ { -+ .ctl_name = 9475, -+ .procname = "cpt", -+ .data = &debug_level, -+ .maxlen = sizeof(debug_level), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { .ctl_name = 0 } -+}; -+static ctl_table root_table[] = { -+ { -+ .ctl_name = CTL_DEBUG, -+ 
.procname = "debug", -+ .mode = 0555, -+ .child = debug_table, -+ }, -+ { .ctl_name = 0 } -+}; -+ -+static int __init init_cpt(void) -+{ -+ int err; -+ -+ err = -ENOMEM; -+ ctl_header = register_sysctl_table(root_table, 0); -+ if (!ctl_header) -+ goto err_mon; -+ -+ spin_lock_init(&cpt_context_lock); -+ INIT_LIST_HEAD(&cpt_context_list); -+ -+ err = -EINVAL; -+ proc_ent = create_proc_entry("cpt", 0600, NULL); -+ if (!proc_ent) -+ goto err_out; -+ -+ cpt_fops.read = proc_ent->proc_fops->read; -+ cpt_fops.write = proc_ent->proc_fops->write; -+ cpt_fops.llseek = proc_ent->proc_fops->llseek; -+ proc_ent->proc_fops = &cpt_fops; -+ -+ proc_ent->read_proc = proc_read; -+ proc_ent->data = NULL; -+ proc_ent->owner = THIS_MODULE; -+ return 0; -+ -+err_out: -+ unregister_sysctl_table(ctl_header); -+err_mon: -+ return err; -+} -+module_init(init_cpt); -+ -+static void __exit exit_cpt(void) -+{ -+ remove_proc_entry("cpt", NULL); -+ unregister_sysctl_table(ctl_header); -+ -+ spin_lock(&cpt_context_lock); -+ while (!list_empty(&cpt_context_list)) { -+ cpt_context_t *ctx; -+ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); -+ -+ if (!ctx->sticky) -+ ctx->refcount++; -+ ctx->sticky = 0; -+ -+ BUG_ON(ctx->refcount != 1); -+ -+ __cpt_context_put(ctx); -+ } -+ spin_unlock(&cpt_context_lock); -+} -+module_exit(exit_cpt); -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_process.c linux-2.6.16-026test009/kernel/cpt/cpt_process.c ---- linux-2.6.16.orig/kernel/cpt/cpt_process.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_process.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,964 @@ -+/* -+ * -+ * kernel/cpt/cpt_process.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/compat.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_ubc.h" -+#include "cpt_process.h" -+#include "cpt_kernel.h" -+ -+#ifdef CONFIG_X86_32 -+#undef task_pt_regs -+#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1) -+#endif -+ -+static u32 encode_segment(u32 segreg) -+{ -+ segreg &= 0xFFFF; -+ -+ if (segreg == 0) -+ return CPT_SEG_ZERO; -+ if ((segreg & 3) != 3) { -+ wprintk("Invalid RPL of a segment reg %x\n", segreg); -+ return CPT_SEG_ZERO; -+ } -+ -+ /* LDT descriptor, it is just an index to LDT array */ -+ if (segreg & 4) -+ return CPT_SEG_LDT + (segreg >> 3); -+ -+ /* TLS descriptor. */ -+ if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN && -+ (segreg >> 3) <= GDT_ENTRY_TLS_MAX) -+ return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN); -+ -+ /* One of standard desriptors */ -+#ifdef CONFIG_X86_64 -+ if (segreg == __USER32_DS) -+ return CPT_SEG_USER32_DS; -+ if (segreg == __USER32_CS) -+ return CPT_SEG_USER32_CS; -+ if (segreg == __USER_DS) -+ return CPT_SEG_USER64_DS; -+ if (segreg == __USER_CS) -+ return CPT_SEG_USER64_CS; -+#else -+ if (segreg == __USER_DS) -+ return CPT_SEG_USER32_DS; -+ if (segreg == __USER_CS) -+ return CPT_SEG_USER32_CS; -+#endif -+ wprintk("Invalid segment reg %x\n", segreg); -+ return CPT_SEG_ZERO; -+} -+ -+#ifdef CONFIG_X86_64 -+static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s, task_t *tsk) -+{ -+ d->cpt_ebp = s->rbp; -+ d->cpt_ebx = s->rbx; -+ d->cpt_eax = s->rax; -+ d->cpt_ecx = s->rcx; -+ d->cpt_edx = s->rdx; -+ d->cpt_esi = s->rsi; -+ d->cpt_edi = s->rdi; -+ d->cpt_orig_eax = s->orig_rax; -+ d->cpt_eip = s->rip; -+ d->cpt_xcs = 
encode_segment(s->cs); -+ d->cpt_eflags = s->eflags; -+ d->cpt_esp = s->rsp; -+ d->cpt_xss = encode_segment(s->ss); -+ d->cpt_xds = encode_segment(tsk->thread.ds); -+ d->cpt_xes = encode_segment(tsk->thread.es); -+} -+ -+static int dump_registers(task_t *tsk, struct cpt_context *ctx) -+{ -+ cpt_open_object(NULL, ctx); -+ -+ if (tsk->thread_info->flags&_TIF_IA32) { -+ struct cpt_x86_regs ri; -+ ri.cpt_next = sizeof(ri); -+ ri.cpt_object = CPT_OBJ_X86_REGS; -+ ri.cpt_hdrlen = sizeof(ri); -+ ri.cpt_content = CPT_CONTENT_VOID; -+ -+ ri.cpt_debugreg[0] = tsk->thread.debugreg0; -+ ri.cpt_debugreg[1] = tsk->thread.debugreg1; -+ ri.cpt_debugreg[2] = tsk->thread.debugreg2; -+ ri.cpt_debugreg[3] = tsk->thread.debugreg3; -+ ri.cpt_debugreg[4] = 0; -+ ri.cpt_debugreg[5] = 0; -+ ri.cpt_debugreg[6] = tsk->thread.debugreg6; -+ ri.cpt_debugreg[7] = tsk->thread.debugreg7; -+ ri.cpt_fs = encode_segment(tsk->thread.fsindex); -+ ri.cpt_gs = encode_segment(tsk->thread.gsindex); -+ -+ xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk); -+ -+ ctx->write(&ri, sizeof(ri), ctx); -+ } else { -+ struct cpt_x86_64_regs ri; -+ ri.cpt_next = sizeof(ri); -+ ri.cpt_object = CPT_OBJ_X86_64_REGS; -+ ri.cpt_hdrlen = sizeof(ri); -+ ri.cpt_content = CPT_CONTENT_VOID; -+ -+ ri.cpt_fsbase = tsk->thread.fs; -+ ri.cpt_gsbase = tsk->thread.gs; -+ ri.cpt_fsindex = encode_segment(tsk->thread.fsindex); -+ ri.cpt_gsindex = encode_segment(tsk->thread.gsindex); -+ ri.cpt_ds = encode_segment(tsk->thread.ds); -+ ri.cpt_es = encode_segment(tsk->thread.es); -+ ri.cpt_debugreg[0] = tsk->thread.debugreg0; -+ ri.cpt_debugreg[1] = tsk->thread.debugreg1; -+ ri.cpt_debugreg[2] = tsk->thread.debugreg2; -+ ri.cpt_debugreg[3] = tsk->thread.debugreg3; -+ ri.cpt_debugreg[4] = 0; -+ ri.cpt_debugreg[5] = 0; -+ ri.cpt_debugreg[6] = tsk->thread.debugreg6; -+ ri.cpt_debugreg[7] = tsk->thread.debugreg7; -+ -+ memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs)); -+ -+ ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs); -+ 
ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss); -+ -+ ctx->write(&ri, sizeof(ri), ctx); -+ -+#if 0 -+ if (ri.cpt_rip >= VSYSCALL_START && ri.cpt_rip < VSYSCALL_END) { -+ eprintk_ctx(CPT_FID "cannot be checkpointied while vsyscall, try later\n", CPT_TID(tsk)); -+ return -EAGAIN; -+ } -+#endif -+ } -+ cpt_close_object(ctx); -+ -+ return 0; -+} -+ -+#else -+ -+static int dump_registers(task_t *tsk, struct cpt_context *ctx) -+{ -+ struct cpt_x86_regs ri; -+ -+ cpt_open_object(NULL, ctx); -+ -+ ri.cpt_next = sizeof(ri); -+ ri.cpt_object = CPT_OBJ_X86_REGS; -+ ri.cpt_hdrlen = sizeof(ri); -+ ri.cpt_content = CPT_CONTENT_VOID; -+ -+ ri.cpt_debugreg[0] = tsk->thread.debugreg[0]; -+ ri.cpt_debugreg[1] = tsk->thread.debugreg[1]; -+ ri.cpt_debugreg[2] = tsk->thread.debugreg[2]; -+ ri.cpt_debugreg[3] = tsk->thread.debugreg[3]; -+ ri.cpt_debugreg[4] = tsk->thread.debugreg[4]; -+ ri.cpt_debugreg[5] = tsk->thread.debugreg[5]; -+ ri.cpt_debugreg[6] = tsk->thread.debugreg[6]; -+ ri.cpt_debugreg[7] = tsk->thread.debugreg[7]; -+ ri.cpt_fs = encode_segment(tsk->thread.fs); -+ ri.cpt_gs = encode_segment(tsk->thread.gs); -+ -+ memcpy(&ri.cpt_ebx, task_pt_regs(tsk), sizeof(struct pt_regs)); -+ -+ ri.cpt_xcs = encode_segment(task_pt_regs(tsk)->xcs); -+ ri.cpt_xss = encode_segment(task_pt_regs(tsk)->xss); -+ ri.cpt_xds = encode_segment(task_pt_regs(tsk)->xds); -+ ri.cpt_xes = encode_segment(task_pt_regs(tsk)->xes); -+ -+ ctx->write(&ri, sizeof(ri), ctx); -+ cpt_close_object(ctx); -+ -+ return 0; -+} -+#endif -+ -+static int dump_kstack(task_t *tsk, struct cpt_context *ctx) -+{ -+ struct cpt_obj_bits hdr; -+ unsigned long size; -+ void *start; -+ -+ cpt_open_object(NULL, ctx); -+ -+#ifdef CONFIG_X86_64 -+ size = tsk->thread.rsp0 - tsk->thread.rsp; -+ start = (void*)tsk->thread.rsp; -+#else -+ size = tsk->thread.esp0 - tsk->thread.esp; -+ start = (void*)tsk->thread.esp; -+#endif -+ -+ hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); -+ hdr.cpt_object = CPT_OBJ_BITS; -+ hdr.cpt_hdrlen = 
sizeof(hdr); -+ hdr.cpt_content = CPT_CONTENT_STACK; -+ hdr.cpt_size = size; -+ -+ ctx->write(&hdr, sizeof(hdr), ctx); -+ ctx->write(start, size, ctx); -+ ctx->align(ctx); -+ cpt_close_object(ctx); -+ return 0; -+} -+ -+/* Formats of i387_fxsave_struct are the same for x86_64 -+ * and i386. Plain luck. */ -+ -+static int dump_fpustate(task_t *tsk, struct cpt_context *ctx) -+{ -+ struct cpt_obj_bits hdr; -+ unsigned long size; -+ int type; -+ -+ cpt_open_object(NULL, ctx); -+ -+ type = CPT_CONTENT_X86_FPUSTATE; -+ size = sizeof(struct i387_fxsave_struct); -+#ifndef CONFIG_X86_64 -+ if (!cpu_has_fxsr) { -+ size = sizeof(struct i387_fsave_struct); -+ type = CPT_CONTENT_X86_FPUSTATE_OLD; -+ } -+#endif -+ -+ hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); -+ hdr.cpt_object = CPT_OBJ_BITS; -+ hdr.cpt_hdrlen = sizeof(hdr); -+ hdr.cpt_content = type; -+ hdr.cpt_size = size; -+ -+ ctx->write(&hdr, sizeof(hdr), ctx); -+ ctx->write(&tsk->thread.i387, size, ctx); -+ ctx->align(ctx); -+ cpt_close_object(ctx); -+ return 0; -+} -+ -+static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info) -+{ -+ si->cpt_signo = info->si_signo; -+ si->cpt_errno = info->si_errno; -+ si->cpt_code = info->si_code; -+ -+ switch(si->cpt_code & __SI_MASK) { -+ case __SI_TIMER: -+ si->cpt_pid = info->si_tid; -+ si->cpt_uid = info->si_overrun; -+ si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr); -+ si->cpt_utime = info->si_sys_private; -+ break; -+ case __SI_POLL: -+ si->cpt_pid = info->si_band; -+ si->cpt_uid = info->si_fd; -+ break; -+ case __SI_FAULT: -+ si->cpt_sigval = cpt_ptr_export(info->si_addr); -+#ifdef __ARCH_SI_TRAPNO -+ si->cpt_pid = info->si_trapno; -+#endif -+ break; -+ case __SI_CHLD: -+ si->cpt_pid = is_virtual_pid(info->si_pid) ? 
info->si_pid : pid_type_to_vpid(PIDTYPE_PID, info->si_pid); -+ si->cpt_uid = info->si_uid; -+ si->cpt_sigval = info->si_status; -+ si->cpt_stime = info->si_stime; -+ si->cpt_utime = info->si_utime; -+ break; -+ case __SI_KILL: -+ case __SI_RT: -+ case __SI_MESGQ: -+ default: -+ si->cpt_pid = is_virtual_pid(info->si_pid) ? info->si_pid : pid_type_to_vpid(PIDTYPE_TGID, info->si_pid); -+ si->cpt_uid = info->si_uid; -+ si->cpt_sigval = cpt_ptr_export(info->si_ptr); -+ break; -+ } -+ return 0; -+} -+ -+static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx) -+{ -+ struct sigqueue *q; -+ loff_t saved_obj; -+ -+ if (list_empty(&list->list)) -+ return 0; -+ -+ cpt_push_object(&saved_obj, ctx); -+ list_for_each_entry(q, &list->list, list) { -+ struct cpt_siginfo_image si; -+ -+ si.cpt_next = sizeof(si); -+ si.cpt_object = CPT_OBJ_SIGINFO; -+ si.cpt_hdrlen = sizeof(si); -+ si.cpt_content = CPT_CONTENT_VOID; -+ -+ si.cpt_qflags = q->flags; -+ si.cpt_user = q->user->uid; -+ -+ if (encode_siginfo(&si, &q->info)) -+ return -EINVAL; -+ -+ ctx->write(&si, sizeof(si), ctx); -+ } -+ cpt_pop_object(&saved_obj, ctx); -+ return 0; -+} -+ -+ -+ -+static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct signal_struct *sig = obj->o_obj; -+ struct cpt_signal_image *v = cpt_get_buf(ctx); -+ task_t *tsk; -+ int i; -+ -+ cpt_open_object(obj, ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_SIGNAL_STRUCT; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ if (sig->pgrp <= 0) { -+ eprintk_ctx("bad pgid\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ v->cpt_pgrp_type = CPT_PGRP_NORMAL; -+ read_lock(&tasklist_lock); -+ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->pgrp); -+ if (tsk == NULL) -+ v->cpt_pgrp_type = CPT_PGRP_ORPHAN; -+ read_unlock(&tasklist_lock); -+ v->cpt_pgrp = pid_type_to_vpid(PIDTYPE_PGID, sig->pgrp); -+ -+ v->cpt_old_pgrp = 0; -+ if (sig->tty_old_pgrp < 0) { -+ 
eprintk_ctx("bad tty_old_pgrp\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ if (sig->tty_old_pgrp > 0) { -+ v->cpt_old_pgrp_type = CPT_PGRP_NORMAL; -+ read_lock(&tasklist_lock); -+ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->tty_old_pgrp); -+ if (tsk == NULL) { -+ v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN; -+ tsk = find_task_by_pid_type_ve(PIDTYPE_PGID, sig->tty_old_pgrp); -+ } -+ read_unlock(&tasklist_lock); -+ if (tsk == NULL) { -+ eprintk_ctx("tty_old_pgrp does not exist anymore\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ v->cpt_old_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, sig->tty_old_pgrp); -+ if ((int)v->cpt_old_pgrp < 0) { -+ dprintk_ctx("stray tty_old_pgrp %d\n", sig->tty_old_pgrp); -+ v->cpt_old_pgrp = -1; -+ v->cpt_old_pgrp_type = CPT_PGRP_STRAY; -+ } -+ } -+ -+ if (sig->session <= 0) { -+ eprintk_ctx("bad session\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ v->cpt_session_type = CPT_PGRP_NORMAL; -+ read_lock(&tasklist_lock); -+ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->session); -+ if (tsk == NULL) -+ v->cpt_session_type = CPT_PGRP_ORPHAN; -+ read_unlock(&tasklist_lock); -+ v->cpt_session = pid_type_to_vpid(PIDTYPE_SID, sig->session); -+ -+ v->cpt_leader = sig->leader; -+ v->cpt_ctty = CPT_NULL; -+ if (sig->tty) { -+ cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx); -+ if (cobj) -+ v->cpt_ctty = cobj->o_pos; -+ else { -+ eprintk_ctx("controlling tty is not found\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ } -+ memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8); -+ -+ v->cpt_curr_target = 0; -+ if (sig->curr_target) -+ v->cpt_curr_target = virt_pid(sig->curr_target); -+ v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0); -+ v->cpt_group_exit_code = sig->group_exit_code; -+ v->cpt_group_exit_task = 0; -+ if (sig->group_exit_task) -+ v->cpt_group_exit_task = virt_pid(sig->group_exit_task); -+ v->cpt_notify_count = sig->notify_count; -+ v->cpt_group_stop_count = 
sig->group_stop_count; -+ -+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8) -+ v->cpt_utime = sig->utime; -+ v->cpt_stime = sig->stime; -+ v->cpt_cutime = sig->cutime; -+ v->cpt_cstime = sig->cstime; -+ v->cpt_nvcsw = sig->nvcsw; -+ v->cpt_nivcsw = sig->nivcsw; -+ v->cpt_cnvcsw = sig->cnvcsw; -+ v->cpt_cnivcsw = sig->cnivcsw; -+ v->cpt_min_flt = sig->min_flt; -+ v->cpt_maj_flt = sig->maj_flt; -+ v->cpt_cmin_flt = sig->cmin_flt; -+ v->cpt_cmaj_flt = sig->cmaj_flt; -+ -+ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) -+ __asm__("undefined\n"); -+ -+ for (i=0; i<CPT_RLIM_NLIMITS; i++) { -+ if (i < RLIM_NLIMITS) { -+ v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur; -+ v->cpt_rlim_max[i] = sig->rlim[i].rlim_max; -+ } else { -+ v->cpt_rlim_cur[i] = CPT_NULL; -+ v->cpt_rlim_max[i] = CPT_NULL; -+ } -+ } -+#endif -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ dump_sigqueue(&sig->shared_pending, ctx); -+ -+ cpt_close_object(ctx); -+ return 0; -+} -+ -+ -+static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ task_t *tsk = obj->o_obj; -+ task_t *next_tsk; -+ struct cpt_task_image *v = cpt_get_buf(ctx); -+ cpt_object_t *tobj; -+ cpt_object_t *tg_obj; -+ loff_t saved_obj; -+ int i; -+ int err; -+ struct timespec delta; -+ -+ cpt_open_object(obj, ctx); -+ -+ v->cpt_signal = CPT_NULL; -+ tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx); -+ if (!tg_obj) BUG(); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_TASK; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ v->cpt_state = tsk->state; -+ if (tsk->state == EXIT_ZOMBIE) { -+ eprintk_ctx("invalid zombie state on" CPT_FID "\n", CPT_TID(tsk)); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } else if (tsk->state == EXIT_DEAD) { -+ if (tsk->exit_state != EXIT_DEAD && -+ tsk->exit_state != EXIT_ZOMBIE) { -+ eprintk_ctx("invalid exit_state %ld on" CPT_FID "\n", tsk->exit_state, CPT_TID(tsk)); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ } -+ if 
(tsk->exit_state) { -+ v->cpt_state = tsk->exit_state; -+ if (tsk->state != EXIT_DEAD) { -+ eprintk_ctx("invalid tsk->state %ld/%ld on" CPT_FID "\n", -+ tsk->state, tsk->exit_state, CPT_TID(tsk)); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ } -+ v->cpt_flags = tsk->flags&~PF_FROZEN; -+ v->cpt_ptrace = tsk->ptrace; -+ v->cpt_prio = tsk->prio; -+ v->cpt_exit_code = tsk->exit_code; -+ v->cpt_exit_signal = tsk->exit_signal; -+ v->cpt_pdeath_signal = tsk->pdeath_signal; -+ v->cpt_static_prio = tsk->static_prio; -+ v->cpt_rt_priority = tsk->rt_priority; -+ v->cpt_policy = tsk->policy; -+ if (v->cpt_policy != SCHED_NORMAL) { -+ eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ -+ v->cpt_mm = CPT_NULL; -+ if (tsk->mm) { -+ tobj = lookup_cpt_object(CPT_OBJ_MM, tsk->mm, ctx); -+ if (!tobj) BUG(); -+ v->cpt_mm = tobj->o_pos; -+ } -+ v->cpt_files = CPT_NULL; -+ if (tsk->files) { -+ tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk->files, ctx); -+ if (!tobj) BUG(); -+ v->cpt_files = tobj->o_pos; -+ } -+ v->cpt_fs = CPT_NULL; -+ if (tsk->fs) { -+ tobj = lookup_cpt_object(CPT_OBJ_FS, tsk->fs, ctx); -+ if (!tobj) BUG(); -+ v->cpt_fs = tobj->o_pos; -+ } -+ v->cpt_namespace = CPT_NULL; -+ if (tsk->namespace) { -+ tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk->namespace, ctx); -+ if (!tobj) BUG(); -+ v->cpt_namespace = tobj->o_pos; -+ -+ if (tsk->namespace != current->namespace) -+ eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm); -+ } -+ v->cpt_sysvsem_undo = CPT_NULL; -+ if (tsk->sysvsem.undo_list && !tsk->exit_state) { -+ tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx); -+ if (!tobj) BUG(); -+ v->cpt_sysvsem_undo = tobj->o_pos; -+ } -+ v->cpt_sighand = CPT_NULL; -+ if (tsk->sighand) { -+ tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx); -+ if (!tobj) BUG(); -+ v->cpt_sighand = 
tobj->o_pos; -+ } -+ v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked); -+ v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked); -+ v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask); -+ -+ v->cpt_pid = virt_pid(tsk); -+ v->cpt_tgid = virt_tgid(tsk); -+ v->cpt_ppid = 0; -+ if (tsk->parent) { -+ if (tsk->parent != tsk->real_parent && -+ !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) { -+ eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, virt_pid(tsk), tsk->comm); -+ cpt_release_buf(ctx); -+ return -EBUSY; -+ } -+ v->cpt_ppid = virt_pid(tsk->parent); -+ } -+ v->cpt_rppid = tsk->real_parent ? virt_pid(tsk->real_parent) : 0; -+ v->cpt_pgrp = virt_pgid(tsk); -+ v->cpt_session = virt_sid(tsk); -+ v->cpt_old_pgrp = 0; -+ if (tsk->signal->tty_old_pgrp) -+ v->cpt_old_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, tsk->signal->tty_old_pgrp); -+ v->cpt_leader = tsk->group_leader ? virt_pid(tsk->group_leader) : 0; -+ v->cpt_set_tid = (unsigned long)tsk->set_child_tid; -+ v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid; -+ memcpy(v->cpt_comm, tsk->comm, 16); -+ v->cpt_user = tsk->user->uid; -+ v->cpt_uid = tsk->uid; -+ v->cpt_euid = tsk->euid; -+ v->cpt_suid = tsk->suid; -+ v->cpt_fsuid = tsk->fsuid; -+ v->cpt_gid = tsk->gid; -+ v->cpt_egid = tsk->egid; -+ v->cpt_sgid = tsk->sgid; -+ v->cpt_fsgid = tsk->fsgid; -+ v->cpt_ngids = 0; -+ if (tsk->group_info && tsk->group_info->ngroups != 0) { -+ int i = tsk->group_info->ngroups; -+ if (i > 32) { -+ /* Shame... I did a simplified version and _forgot_ -+ * about this. Later, later. 
*/ -+ eprintk_ctx("too many of groups " CPT_FID "\n", CPT_TID(tsk)); -+ return -EINVAL; -+ } -+ v->cpt_ngids = i; -+ for (i--; i>=0; i--) -+ v->cpt_gids[i] = tsk->group_info->small_block[i]; -+ } -+ memcpy(&v->cpt_ecap, &tsk->cap_effective, 8); -+ memcpy(&v->cpt_icap, &tsk->cap_inheritable, 8); -+ memcpy(&v->cpt_pcap, &tsk->cap_permitted, 8); -+ v->cpt_keepcap = tsk->keep_capabilities; -+ -+ v->cpt_did_exec = tsk->did_exec; -+ v->cpt_exec_domain = -1; -+ v->cpt_thrflags = tsk->thread_info->flags & ~(1<<TIF_FREEZE); -+ v->cpt_64bit = 0; -+#ifdef CONFIG_X86_64 -+ /* Clear x86_64 specific flags */ -+ v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32); -+ if (!(tsk->thread_info->flags & _TIF_IA32)) { -+ ctx->tasks64++; -+ v->cpt_64bit = 1; -+ } -+#endif -+ v->cpt_thrstatus = tsk->thread_info->status; -+ v->cpt_addr_limit = -1; -+ -+ v->cpt_personality = tsk->personality; -+ -+ for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) { -+ if (i>=3) { -+ eprintk_ctx("too many tls descs\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+#ifndef CONFIG_X86_64 -+ v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a; -+#else -+ v->cpt_tls[i] = tsk->thread.tls_array[i]; -+#endif -+ } -+ -+ v->cpt_restart.fn = CPT_RBL_0; -+ if (tsk->thread_info->restart_block.fn != current->thread_info->restart_block.fn) { -+ if (tsk->thread_info->restart_block.fn != nanosleep_restart -+#ifdef CONFIG_X86_64 -+ && tsk->thread_info->restart_block.fn != compat_nanosleep_restart -+#endif -+ ) { -+ eprintk_ctx("unknown restart block %p\n", tsk->thread_info->restart_block.fn); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ v->cpt_restart.fn = CPT_RBL_NANOSLEEP; -+#ifdef CONFIG_X86_64 -+ if (tsk->thread_info->restart_block.fn == compat_nanosleep_restart) -+ v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP; -+#endif -+ v->cpt_restart.arg0 = tsk->thread_info->restart_block.arg0; -+ v->cpt_restart.arg1 = tsk->thread_info->restart_block.arg1; -+ v->cpt_restart.arg2 = 
tsk->thread_info->restart_block.arg2; -+ v->cpt_restart.arg3 = tsk->thread_info->restart_block.arg3; -+ if (debug_level > 2) { -+ ktime_t e, e1; -+ struct timespec now; -+ -+ do_posix_clock_monotonic_gettime(&now); -+ e = timespec_to_ktime(now); -+ e1.tv64 = ((u64)tsk->thread_info->restart_block.arg1 << 32) | (u64) tsk->thread_info->restart_block.arg0; -+ e = ktime_sub(e1, e); -+ dprintk("cpt " CPT_FID " RBL %ld/%ld %Ld\n", CPT_TID(tsk), -+ tsk->thread_info->restart_block.arg1, -+ tsk->thread_info->restart_block.arg0, e.tv64); -+ } -+ } -+ -+ v->cpt_it_real_incr = 0; -+ v->cpt_it_prof_incr = 0; -+ v->cpt_it_virt_incr = 0; -+ v->cpt_it_real_value = 0; -+ v->cpt_it_prof_value = 0; -+ v->cpt_it_virt_value = 0; -+ if (thread_group_leader(tsk) && tsk->exit_state == 0) { -+ ktime_t rem; -+ -+ v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr); -+ v->cpt_it_prof_incr = tsk->signal->it_prof_incr; -+ v->cpt_it_virt_incr = tsk->signal->it_virt_incr; -+ -+ rem = hrtimer_get_remaining(&tsk->signal->real_timer); -+ -+ if (hrtimer_active(&tsk->signal->real_timer)) { -+ if (rem.tv64 <= 0) -+ rem.tv64 = NSEC_PER_USEC; -+ v->cpt_it_real_value = ktime_to_ns(rem); -+ dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), v->cpt_it_real_value); -+ } -+ v->cpt_it_prof_value = tsk->signal->it_prof_expires; -+ v->cpt_it_virt_value = tsk->signal->it_virt_expires; -+ } -+ v->cpt_used_math = (tsk_used_math(tsk) != 0); -+ -+ if (tsk->notifier) { -+ eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ -+ v->cpt_utime = tsk->utime; -+ v->cpt_stime = tsk->stime; -+ delta = tsk->start_time; -+ _set_normalized_timespec(&delta, -+ delta.tv_sec - get_exec_env()->init_entry->start_time.tv_sec, -+ delta.tv_nsec - get_exec_env()->init_entry->start_time.tv_nsec); -+ v->cpt_starttime = cpt_timespec_export(&delta); -+ v->cpt_nvcsw = tsk->nvcsw; -+ v->cpt_nivcsw = tsk->nivcsw; -+ v->cpt_min_flt = 
tsk->min_flt; -+ v->cpt_maj_flt = tsk->maj_flt; -+ -+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) -+ v->cpt_cutime = tsk->cutime; -+ v->cpt_cstime = tsk->cstime; -+ v->cpt_cnvcsw = tsk->cnvcsw; -+ v->cpt_cnivcsw = tsk->cnivcsw; -+ v->cpt_cmin_flt = tsk->cmin_flt; -+ v->cpt_cmaj_flt = tsk->cmaj_flt; -+ -+ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) -+ __asm__("undefined\n"); -+ -+ for (i=0; i<CPT_RLIM_NLIMITS; i++) { -+ if (i < RLIM_NLIMITS) { -+ v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur; -+ v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max; -+ } else { -+ v->cpt_rlim_cur[i] = CPT_NULL; -+ v->cpt_rlim_max[i] = CPT_NULL; -+ } -+ } -+#else -+ v->cpt_cutime = tsk->signal->cutime; -+ v->cpt_cstime = tsk->signal->cstime; -+ v->cpt_cnvcsw = tsk->signal->cnvcsw; -+ v->cpt_cnivcsw = tsk->signal->cnivcsw; -+ v->cpt_cmin_flt = tsk->signal->cmin_flt; -+ v->cpt_cmaj_flt = tsk->signal->cmaj_flt; -+ -+ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) -+ __asm__("undefined\n"); -+ -+ for (i=0; i<CPT_RLIM_NLIMITS; i++) { -+ if (i < RLIM_NLIMITS) { -+ v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur; -+ v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max; -+ } else { -+ v->cpt_rlim_cur[i] = CPT_NULL; -+ v->cpt_rlim_max[i] = CPT_NULL; -+ } -+ } -+#endif -+ -+ if (tsk->mm) -+ v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx); -+ else -+ v->cpt_mm_ub = CPT_NULL; -+ v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx); -+ v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx); -+ v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx); -+ -+ v->cpt_ptrace_message = tsk->ptrace_message; -+ v->cpt_pn_state = tsk->pn_state; -+ v->cpt_stopped_state = tsk->stopped_state; -+ v->cpt_sigsuspend_state = 0; -+ -+#ifndef CONFIG_X86_64 -+ if (tsk->thread.vm86_info) { -+ eprintk_ctx("vm86 task is running\n"); -+ cpt_release_buf(ctx); -+ return -EBUSY; -+ } -+#endif -+ -+ v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal); -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ 
cpt_push_object(&saved_obj, ctx); -+ dump_kstack(tsk, ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ -+ cpt_push_object(&saved_obj, ctx); -+ err = dump_registers(tsk, ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ if (err) -+ return err; -+ -+ if (tsk_used_math(tsk)) { -+ cpt_push_object(&saved_obj, ctx); -+ dump_fpustate(tsk, ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ } -+ -+ if (tsk->last_siginfo) { -+ struct cpt_siginfo_image si; -+ cpt_push_object(&saved_obj, ctx); -+ -+ si.cpt_next = sizeof(si); -+ si.cpt_object = CPT_OBJ_LASTSIGINFO; -+ si.cpt_hdrlen = sizeof(si); -+ si.cpt_content = CPT_CONTENT_VOID; -+ -+ if (encode_siginfo(&si, tsk->last_siginfo)) -+ return -EINVAL; -+ -+ ctx->write(&si, sizeof(si), ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ } -+ -+ dump_sigqueue(&tsk->pending, ctx); -+ -+ next_tsk = NULL; -+ if (obj->o_list.next != &ctx->object_array[CPT_OBJ_TASK]) { -+ tobj = list_entry(obj->o_list.next, cpt_object_t, o_list); -+ next_tsk = tobj->o_obj; -+ if (next_tsk->tgid != tsk->tgid) -+ next_tsk = NULL; -+ } -+ -+ if (next_tsk == NULL) { -+ int err; -+ loff_t pos = ctx->file->f_pos; -+ -+ cpt_push_object(&saved_obj, ctx); -+ err = dump_one_signal_struct(tg_obj, ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ if (err) -+ return err; -+ -+ for (;;) { -+ task_t *prev_tsk; -+ loff_t tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal); -+ -+ ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos); -+ -+ if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) -+ break; -+ -+ tobj = list_entry(obj->o_list.prev, cpt_object_t, o_list); -+ prev_tsk = tobj->o_obj; -+ if (prev_tsk->tgid != tsk->tgid) -+ break; -+ obj = tobj; -+ } -+ } -+ -+ cpt_close_object(ctx); -+ return 0; -+} -+ -+int cpt_dump_tasks(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ cpt_open_section(ctx, CPT_SECT_TASKS); -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ int err; -+ -+ if ((err = dump_one_process(obj, ctx)) != 0) -+ return err; -+ } -+ -+ cpt_close_section(ctx); -+ return 
0; -+} -+ -+int cpt_collect_signals(cpt_context_t *ctx) -+{ -+ cpt_object_t *obj; -+ -+ /* Collect process fd sets */ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) { -+ eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, virt_pid(tsk), tsk->comm); -+ return -EBUSY; -+ } -+ if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL) -+ return -ENOMEM; -+ if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL) -+ return -ENOMEM; -+ } -+ return 0; -+} -+ -+ -+static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct sighand_struct *sig = obj->o_obj; -+ struct cpt_sighand_image *v = cpt_get_buf(ctx); -+ int i; -+ -+ cpt_open_object(obj, ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_SIGHAND_STRUCT; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ for (i=0; i< _NSIG; i++) { -+ if (sig->action[i].sa.sa_handler != SIG_DFL) { -+ loff_t saved_obj; -+ struct cpt_sighandler_image *o = cpt_get_buf(ctx); -+ -+ cpt_push_object(&saved_obj, ctx); -+ cpt_open_object(NULL, ctx); -+ -+ o->cpt_next = CPT_NULL; -+ o->cpt_object = CPT_OBJ_SIGHANDLER; -+ o->cpt_hdrlen = sizeof(*o); -+ o->cpt_content = CPT_CONTENT_VOID; -+ -+ o->cpt_signo = i; -+ o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler; -+ o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer; -+ o->cpt_flags = sig->action[i].sa.sa_flags; -+ memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8); -+ ctx->write(o, sizeof(*o), ctx); -+ cpt_release_buf(ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ } -+ } -+ -+ cpt_close_object(ctx); -+ return 0; -+} -+ -+int cpt_dump_sighand(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT); -+ -+ for_each_object(obj, 
CPT_OBJ_SIGHAND_STRUCT) { -+ int err; -+ -+ if ((err = dump_one_sighand_struct(obj, ctx)) != 0) -+ return err; -+ } -+ -+ cpt_close_section(ctx); -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_process.h linux-2.6.16-026test009/kernel/cpt/cpt_process.h ---- linux-2.6.16.orig/kernel/cpt/cpt_process.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_process.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,10 @@ -+int cpt_collect_signals(cpt_context_t *); -+int cpt_dump_signal(struct cpt_context *); -+int cpt_dump_sighand(struct cpt_context *); -+int cpt_dump_tasks(struct cpt_context *); -+ -+int rst_signal_complete(struct cpt_task_image *ti, struct cpt_context *ctx); -+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx); -+ -+int rst_restore_process(struct cpt_context *ctx); -+int rst_process_linkage(struct cpt_context *ctx); -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket.c linux-2.6.16-026test009/kernel/cpt/cpt_socket.c ---- linux-2.6.16.orig/kernel/cpt/cpt_socket.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_socket.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,757 @@ -+/* -+ * -+ * kernel/cpt/cpt_socket.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/fs.h> -+#include <linux/socket.h> -+#include <linux/un.h> -+#include <linux/tcp.h> -+#include <net/sock.h> -+#include <net/scm.h> -+#include <net/af_unix.h> -+#include <net/tcp.h> -+#include <net/netlink_sock.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_socket.h" -+#include "cpt_files.h" -+#include "cpt_kernel.h" -+ -+static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx); -+ -+ -+/* Sockets are quite different of another kinds of files. -+ * There is one simplification: only one struct file can refer to a socket, -+ * so we could store information about socket directly in section FILES as -+ * a description of a file and append f.e. array of not-yet-accepted -+ * connections of listening socket as array of auxiliary data. -+ * -+ * Complications are: -+ * 1. TCP sockets can be orphans. We have to relocate orphans as well, -+ * so we have to create special section for orphans. -+ * 2. AF_UNIX sockets are distinguished objects: set of links between -+ * AF_UNIX sockets is quite arbitrary. -+ * A. Each socket can refers to many of files due to FD passing. -+ * B. Each socket except for connected ones can have in queue skbs -+ * sent by any of sockets. -+ * -+ * 2A is relatively easy: after our tasks are frozen we make an additional -+ * recursive pass throgh set of collected files and get referenced to -+ * FD passed files. After end of recursion, all the files are treated -+ * in the same way. All they will be stored in section FILES. -+ * -+ * 2B. We have to resolve all those references at some point. -+ * It is the place where pipe-like approach to image fails. 
-+ * -+ * All this makes socket checkpointing quite chumbersome. -+ * Right now we collect all the sockets and assign some numeric index value -+ * to each of them. The socket section is separate and put after section FILES, -+ * so section FILES refers to sockets by index, section SOCKET refers to FILES -+ * as usual by position in image. All the refs inside socket section are -+ * by index. When restoring we read socket section, create objects to hold -+ * mappings index <-> pos. At the second pass we open sockets (simultaneosly -+ * with their pairs) and create FILE objects. -+ */ -+ -+ -+/* ====== FD passing ====== */ -+ -+/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we -+ * have to implement this. A problem is that in general case we receive -+ * skbs from an unknown context, so new files can arrive to checkpointed -+ * set of processes even after they are stopped. Well, we are going just -+ * to ignore unknown fds while doing real checkpointing. It is fair because -+ * links outside checkpointed set are going to fail anyway. -+ * -+ * ATTN: the procedure is recursive. We linearize the recursion adding -+ * newly found files to the end of file list, so they will be analyzed -+ * in the same loop. -+ */ -+ -+static int collect_one_passedfd(struct file *file, cpt_context_t * ctx) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ struct socket *sock; -+ struct sock *sk; -+ struct sk_buff *skb; -+ -+ if (!S_ISSOCK(inode->i_mode)) -+ return -ENOTSOCK; -+ -+ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; -+ -+ if (sock->ops->family != AF_UNIX) -+ return 0; -+ -+ sk = sock->sk; -+ -+ /* Subtle locking issue. skbs cannot be removed while -+ * we are scanning, because all the processes are stopped. -+ * They still can be added to tail of queue. Locking while -+ * we dereference skb->next is enough to resolve this. -+ * See above about collision with skbs added after we started -+ * checkpointing. 
-+ */ -+ -+ skb = skb_peek(&sk->sk_receive_queue); -+ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { -+ if (UNIXCB(skb).fp && skb->sk && -+ (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) { -+ struct scm_fp_list *fpl = UNIXCB(skb).fp; -+ int i; -+ -+ for (i = fpl->count-1; i >= 0; i--) { -+ if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL) -+ return -ENOMEM; -+ } -+ } -+ -+ spin_lock_irq(&sk->sk_receive_queue.lock); -+ skb = skb->next; -+ spin_unlock_irq(&sk->sk_receive_queue.lock); -+ } -+ -+ return 0; -+} -+ -+int cpt_collect_passedfds(cpt_context_t * ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file = obj->o_obj; -+ -+ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { -+ int err; -+ -+ if ((err = collect_one_passedfd(file, ctx)) < 0) -+ return err; -+ } -+ } -+ -+ return 0; -+} -+ -+/* ====== End of FD passing ====== */ -+ -+/* Must be called under bh_lock_sock() */ -+ -+void clear_backlog(struct sock *sk) -+{ -+ struct sk_buff *skb = sk->sk_backlog.head; -+ -+ sk->sk_backlog.head = sk->sk_backlog.tail = NULL; -+ while (skb) { -+ struct sk_buff *next = skb->next; -+ -+ skb->next = NULL; -+ kfree_skb(skb); -+ skb = next; -+ } -+} -+ -+void release_sock_nobacklog(struct sock *sk) -+{ -+ spin_lock_bh(&(sk->sk_lock.slock)); -+ clear_backlog(sk); -+ sk->sk_lock.owner = NULL; -+ if (waitqueue_active(&(sk->sk_lock.wq))) -+ wake_up(&(sk->sk_lock.wq)); -+ spin_unlock_bh(&(sk->sk_lock.slock)); -+} -+ -+int cpt_dump_skb(int type, int owner, struct sk_buff *skb, -+ struct cpt_context *ctx) -+{ -+ struct cpt_skb_image *v = cpt_get_buf(ctx); -+ loff_t saved_obj; -+ struct timeval tmptv; -+ -+ cpt_push_object(&saved_obj, ctx); -+ cpt_open_object(NULL, ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_SKB; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ v->cpt_owner = owner; -+ v->cpt_queue = type; -+ skb_get_timestamp(skb, &tmptv); -+ v->cpt_stamp = 
cpt_timeval_export(&tmptv);
-+	/* Header pointers are saved as offsets relative to skb->head. */
-+	v->cpt_hspace = skb->data - skb->head;
-+	v->cpt_tspace = skb->end - skb->tail;
-+	v->cpt_h = skb->h.raw - skb->head;
-+	v->cpt_nh = skb->nh.raw - skb->head;
-+	v->cpt_mac = skb->mac.raw - skb->head;
-+	/* The image copy of cb[] must not be larger than the skb's own. */
-+	if (sizeof(skb->cb) < sizeof(v->cpt_cb)) BUG();
-+	memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb));
-+	/* Only the first sizeof(v->cpt_cb) bytes are saved; warn when the
-+	 * remaining bytes of skb->cb are non-zero, because they are dropped. */
-+	if (sizeof(skb->cb) > sizeof(v->cpt_cb)) {
-+		int i;
-+		for (i=sizeof(v->cpt_cb); i<sizeof(skb->cb); i++) {
-+			if (skb->cb[i]) {
-+				wprintk_ctx("dirty skb cb");
-+				break;
-+			}
-+		}
-+	}
-+	v->cpt_len = skb->len;
-+	v->cpt_mac_len = skb->mac_len;
-+	v->cpt_csum = skb->csum;
-+	v->cpt_local_df = skb->local_df;
-+	v->cpt_pkt_type = skb->pkt_type;
-+	v->cpt_ip_summed = skb->ip_summed;
-+	v->cpt_priority = skb->priority;
-+	v->cpt_protocol = skb->protocol;
-+	v->cpt_security = 0;
-+	v->cpt_tso_segs = skb_shinfo(skb)->tso_segs;
-+	v->cpt_tso_size = skb_shinfo(skb)->tso_size;
-+	if (skb_shinfo(skb)->ufo_size) {
-+		eprintk_ctx("skb ufo is not supported\n");
-+		/* Hand the buffer obtained with cpt_get_buf() back before
-+		 * failing, like the error paths of cpt_dump_socket() do.
-+		 * NOTE(review): the object opened by cpt_open_object() is
-+		 * still left open here, as on the other error paths. */
-+		cpt_release_buf(ctx);
-+		return -EINVAL;
-+	}
-+
-+	ctx->write(v, sizeof(*v), ctx);
-+	cpt_release_buf(ctx);
-+
-+	/* Payload: headroom and linear data straight from skb->head, then
-+	 * the non-linear part, all wrapped in one CPT_OBJ_BITS object. */
-+	if (skb->len + (skb->data - skb->head) > 0) {
-+		struct cpt_obj_bits ob;
-+		loff_t saved_obj2;
-+
-+		cpt_push_object(&saved_obj2, ctx);
-+		cpt_open_object(NULL, ctx);
-+		ob.cpt_next = CPT_NULL;
-+		ob.cpt_object = CPT_OBJ_BITS;
-+		ob.cpt_hdrlen = sizeof(ob);
-+		ob.cpt_content = CPT_CONTENT_DATA;
-+		/* v was released above, so take the headroom from the skb
-+		 * itself (same value that was stored in v->cpt_hspace). */
-+		ob.cpt_size = skb->len + (skb->data - skb->head);
-+
-+		ctx->write(&ob, sizeof(ob), ctx);
-+
-+		ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx);
-+		if (skb->data_len) {
-+			/* Copy the paged data through ctx->tmpbuf in chunks
-+			 * of at most PAGE_SIZE bytes. */
-+			int offset = skb->len - skb->data_len;
-+			while (offset < skb->len) {
-+				int copy = skb->len - offset;
-+				if (copy > PAGE_SIZE)
-+					copy = PAGE_SIZE;
-+				(void)cpt_get_buf(ctx);
-+				if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy))
-+					BUG();
-+				ctx->write(ctx->tmpbuf, copy, ctx);
-+				__cpt_release_buf(ctx);
-+				offset += copy;
-+			}
-+		}
-+
-+		ctx->align(ctx);
-+		cpt_close_object(ctx);
-+		cpt_pop_object(&saved_obj2, ctx);
-+
} -+ -+ if (skb->sk && skb->sk->sk_family == AF_UNIX) { -+ struct scm_fp_list *fpl = UNIXCB(skb).fp; -+ -+ if (fpl) { -+ int i; -+ -+ for (i = 0; i < fpl->count; i++) { -+ struct cpt_fd_image v; -+ cpt_object_t *obj; -+ loff_t saved_obj2; -+ -+ obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx); -+ -+ if (!obj) { -+ eprintk_ctx("lost passed FD\n"); -+ return -EINVAL; -+ } -+ -+ cpt_push_object(&saved_obj2, ctx); -+ cpt_open_object(NULL, ctx); -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_FILEDESC; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_VOID; -+ -+ v.cpt_fd = i; -+ v.cpt_file = obj->o_pos; -+ v.cpt_flags = 0; -+ ctx->write(&v, sizeof(v), ctx); -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_obj2, ctx); -+ } -+ } -+ } -+ -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ return 0; -+} -+ -+static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx) -+{ -+ struct sk_buff *skb; -+ struct sock *sk_cache = NULL; -+ -+ skb = skb_peek(&sk->sk_receive_queue); -+ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { -+ int err; -+ -+ if (sk->sk_family == AF_UNIX) { -+ cpt_object_t *obj; -+ if (skb->sk != sk_cache) { -+ idx = -1; -+ sk_cache = NULL; -+ obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx); -+ if (obj) { -+ idx = obj->o_index; -+ sk_cache = skb->sk; -+ } else if (unix_peer(sk) != skb->sk) -+ goto next_skb; -+ } -+ } -+ -+ err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx); -+ if (err) -+ return err; -+ -+next_skb: -+ spin_lock_irq(&sk->sk_receive_queue.lock); -+ skb = skb->next; -+ spin_unlock_irq(&sk->sk_receive_queue.lock); -+ } -+ return 0; -+} -+ -+static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx) -+{ -+ struct sk_buff *skb; -+ -+ skb = skb_peek(&sk->sk_write_queue); -+ while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) { -+ int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx); -+ if (err) -+ return err; -+ -+ spin_lock_irq(&sk->sk_write_queue.lock); -+ skb = 
skb->next;
-+		spin_unlock_irq(&sk->sk_write_queue.lock);
-+	}
-+	return 0;
-+}
-+
-+
-+/* Dump socket content.
-+ *
-+ * Fills a struct cpt_sock_image for sk in the context buffer and writes it
-+ * to the image, followed by the socket's queued skbs.
-+ *
-+ * obj    - object table entry passed to cpt_open_object(); callers in this
-+ *          file pass NULL for orphans and accept-queue children.
-+ * sk     - socket to dump.
-+ * index  - stored in cpt_index and used as owner of the dumped queues.
-+ * parent - stored in cpt_parent.
-+ *
-+ * Returns 0 on success or a negative errno.
-+ */
-+
-+int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx)
-+{
-+	struct cpt_sock_image *v = cpt_get_buf(ctx);
-+	struct socket *sock;
-+
-+	cpt_open_object(obj, ctx);
-+
-+	v->cpt_next = CPT_NULL;
-+	v->cpt_object = CPT_OBJ_SOCKET;
-+	v->cpt_hdrlen = sizeof(*v);
-+	v->cpt_content = CPT_CONTENT_ARRAY;
-+
-+	/* Reference the owning file by its image position, if it is known. */
-+	v->cpt_file = CPT_NULL;
-+	sock = sk->sk_socket;
-+	if (sock && sock->file) {
-+		cpt_object_t *tobj;
-+		tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx);
-+		if (tobj)
-+			v->cpt_file = tobj->o_pos;
-+	}
-+	v->cpt_index = index;
-+	v->cpt_parent = parent;
-+
-+	/* Inet sockets are locked once and stay locked; obj->o_lock records
-+	 * that so cpt_unlock_sockets()/cpt_kill_sockets() can release them.
-+	 * NOTE(review): obj is dereferenced only when sock is non-NULL;
-+	 * confirm callers passing obj == NULL always have sk_socket == NULL. */
-+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
-+		if (sock && !obj->o_lock) {
-+			lock_sock(sk);
-+			obj->o_lock = 1;
-+		}
-+	}
-+
-+	/* Some bits stored in inode */
-+	v->cpt_ssflags = sock ? sock->flags : 0;
-+	v->cpt_sstate = sock ? sock->state : 0;
-+	v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0;
-+
-+	/* Common data */
-+	v->cpt_family = sk->sk_family;
-+	v->cpt_type = sk->sk_type;
-+	v->cpt_state = sk->sk_state;
-+	v->cpt_reuse = sk->sk_reuse;
-+	v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED);
-+	v->cpt_shutdown = sk->sk_shutdown;
-+	v->cpt_userlocks = sk->sk_userlocks;
-+	v->cpt_no_check = sk->sk_no_check;
-+	/* SOCK_DBG used to be stored into cpt_zapped as well, overwriting
-+	 * the SOCK_ZAPPED value saved above.
-+	 * NOTE(review): assumes struct cpt_sock_image has a cpt_debug
-+	 * member for this flag - confirm against cpt_image.h. */
-+	v->cpt_debug = sock_flag(sk, SOCK_DBG);
-+	v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP);
-+	v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE);
-+	v->cpt_protocol = sk->sk_protocol;
-+	v->cpt_err = sk->sk_err;
-+	v->cpt_err_soft = sk->sk_err_soft;
-+	v->cpt_max_ack_backlog = sk->sk_max_ack_backlog;
-+	v->cpt_priority = sk->sk_priority;
-+	v->cpt_rcvlowat = sk->sk_rcvlowat;
-+	/* CPT_NULL stands for "no timeout" (MAX_SCHEDULE_TIMEOUT); finite
-+	 * values are clamped to INT_MAX. */
-+	v->cpt_rcvtimeo = CPT_NULL;
-+	if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT)
-+		v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ?
INT_MAX : sk->sk_rcvtimeo; -+ v->cpt_sndtimeo = CPT_NULL; -+ if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT) -+ v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo; -+ v->cpt_rcvbuf = sk->sk_rcvbuf; -+ v->cpt_sndbuf = sk->sk_sndbuf; -+ v->cpt_bound_dev_if = sk->sk_bound_dev_if; -+ v->cpt_flags = sk->sk_flags; -+ v->cpt_lingertime = CPT_NULL; -+ if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT) -+ v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? INT_MAX : sk->sk_lingertime; -+ v->cpt_peer_pid = sk->sk_peercred.pid; -+ v->cpt_peer_uid = sk->sk_peercred.uid; -+ v->cpt_peer_gid = sk->sk_peercred.gid; -+ v->cpt_stamp = cpt_timeval_export(&sk->sk_stamp); -+ -+ if (sk->sk_filter) { -+ eprintk_ctx("checkpointing sk_filter is not implemented\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ -+ v->cpt_peer = -1; -+ v->cpt_socketpair = 0; -+ v->cpt_deleted = 0; -+ -+ v->cpt_laddrlen = 0; -+ if (sock) { -+ int alen = sizeof(v->cpt_laddr); -+ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0); -+ if (err) { -+ cpt_release_buf(ctx); -+ return err; -+ } -+ v->cpt_laddrlen = alen; -+ } -+ v->cpt_raddrlen = 0; -+ if (sock) { -+ int alen = sizeof(v->cpt_raddr); -+ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2); -+ if (!err) -+ v->cpt_raddrlen = alen; -+ } -+ -+ if (sk->sk_family == AF_UNIX) { -+ if (unix_sk(sk)->dentry) { -+ struct dentry *d = unix_sk(sk)->dentry; -+ v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d); -+ if (!v->cpt_deleted) { -+ int err = 0; -+ char *path; -+ unsigned long pg = __get_free_page(GFP_KERNEL); -+ -+ if (!pg) { -+ cpt_release_buf(ctx); -+ return -ENOMEM; -+ } -+ -+ path = d_path(d, unix_sk(sk)->mnt, (char *)pg, PAGE_SIZE); -+ -+ if (!IS_ERR(path)) { -+ int len = strlen(path); -+ if (len < 126) { -+ strcpy(((char*)v->cpt_laddr)+2, path); -+ v->cpt_laddrlen = len + 2; -+ } else { -+ wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2); -+ } -+ err = 
cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx); -+ } else { -+ eprintk_ctx("cannot get path of an af_unix socket\n"); -+ err = PTR_ERR(path); -+ } -+ free_page(pg); -+ if (err) { -+ cpt_release_buf(ctx); -+ return err; -+ } -+ } -+ } -+ -+ /* If the socket is connected, find its peer. If peer is not -+ * in our table, the socket is connected to external process -+ * and we consider it disconnected. -+ */ -+ if (unix_peer(sk)) { -+ cpt_object_t *pobj; -+ pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx); -+ if (pobj) -+ v->cpt_peer = pobj->o_index; -+ else -+ v->cpt_shutdown = SHUTDOWN_MASK; -+ -+ if (unix_peer(unix_peer(sk)) == sk) -+ v->cpt_socketpair = 1; -+ } -+ -+ /* If the socket shares address with another socket it is -+ * child of some listening socket. Find and record it. */ -+ if (unix_sk(sk)->addr && -+ atomic_read(&unix_sk(sk)->addr->refcnt) > 1 && -+ sk->sk_state != TCP_LISTEN) { -+ cpt_object_t *pobj; -+ for_each_object(pobj, CPT_OBJ_SOCKET) { -+ struct sock *psk = pobj->o_obj; -+ if (psk->sk_family == AF_UNIX && -+ psk->sk_state == TCP_LISTEN && -+ unix_sk(psk)->addr == unix_sk(sk)->addr) { -+ v->cpt_parent = pobj->o_index; -+ break; -+ } -+ } -+ } -+ } -+ -+ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) -+ cpt_dump_socket_in(v, sk, ctx); -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ dump_rqueue(index, sk, ctx); -+ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { -+ dump_wqueue(index, sk, ctx); -+ cpt_dump_ofo_queue(index, sk, ctx); -+ } -+ -+ if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) -+ && sk->sk_state == TCP_LISTEN) -+ cpt_dump_synwait_queue(sk, index, ctx); -+ -+ cpt_close_object(ctx); -+ -+ if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) -+ && sk->sk_state == TCP_LISTEN) -+ cpt_dump_accept_queue(sk, index, ctx); -+ -+ return 0; -+} -+ -+int cpt_dump_orphaned_sockets(struct cpt_context *ctx) -+{ -+ int i; -+ -+ cpt_open_section(ctx, CPT_SECT_ORPHANS); -+ 
-+ for (i = 0; i < tcp_hashinfo.ehash_size; i++) { -+ struct sock *sk; -+ struct hlist_node *node; -+ -+retry: -+ read_lock_bh(&tcp_hashinfo.ehash[i].lock); -+ sk_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) { -+ -+ if (VE_OWNER_SK(sk) != get_exec_env()) -+ continue; -+ if (sk->sk_socket) -+ continue; -+ if (!sock_flag(sk, SOCK_DEAD)) -+ continue; -+ if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx)) -+ continue; -+ sock_hold(sk); -+ read_unlock_bh(&tcp_hashinfo.ehash[i].lock); -+ -+ local_bh_disable(); -+ bh_lock_sock(sk); -+ if (sock_owned_by_user(sk)) -+ eprintk_ctx("BUG: sk locked by whom?\n"); -+ sk->sk_lock.owner = (void *)1; -+ bh_unlock_sock(sk); -+ local_bh_enable(); -+ -+ cpt_dump_socket(NULL, sk, -1, -1, ctx); -+ -+ local_bh_disable(); -+ bh_lock_sock(sk); -+ sk->sk_lock.owner = NULL; -+ clear_backlog(sk); -+ tcp_done(sk); -+ bh_unlock_sock(sk); -+ local_bh_enable(); -+ sock_put(sk); -+ -+ goto retry; -+ } -+ read_unlock_bh(&tcp_hashinfo.ehash[i].lock); -+ } -+ cpt_close_section(ctx); -+ return 0; -+} -+ -+static int can_dump(struct sock *sk, cpt_context_t *ctx) -+{ -+ switch (sk->sk_family) { -+ case AF_NETLINK: -+ if (((struct netlink_sock *)sk)->cb) { -+ eprintk_ctx("netlink socket has active callback\n"); -+ return 0; -+ } -+ break; -+ } -+ return 1; -+} -+ -+/* We are not going to block suspend when we have external AF_UNIX connections. -+ * But we cannot stop feed of new packets/connections to our environment -+ * from outside. Taking into account that it is intrincically unreliable, -+ * we collect some amount of data, but when checkpointing/restoring we -+ * are going to drop everything, which does not make sense: skbs sent -+ * by outside processes, connections from outside etc. etc. -+ */ -+ -+/* The first pass. 
When we see socket referenced by a file, we just -+ * add it to socket table */ -+int cpt_collect_socket(struct file *file, cpt_context_t * ctx) -+{ -+ cpt_object_t *obj; -+ struct socket *sock; -+ struct sock *sk; -+ -+ if (!S_ISSOCK(file->f_dentry->d_inode->i_mode)) -+ return -ENOTSOCK; -+ sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket; -+ sk = sock->sk; -+ if (!can_dump(sk, ctx)) -+ return -EBUSY; -+ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL) -+ return -ENOMEM; -+ obj->o_parent = file; -+ -+ return 0; -+} -+ -+/* -+ * We should end with table containing: -+ * * all sockets opened by our processes in the table. -+ * * all the sockets queued in listening queues on _our_ listening sockets, -+ * which are connected to our opened sockets. -+ */ -+ -+static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx) -+{ -+ struct sock *sk = obj->o_obj; -+ cpt_object_t *cobj; -+ struct sk_buff *skb; -+ -+ skb = skb_peek(&sk->sk_receive_queue); -+ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { -+ struct sock *lsk = skb->sk; -+ if (unix_peer(lsk) && -+ lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) { -+ if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL) -+ return -ENOMEM; -+ cobj->o_parent = obj->o_parent; -+ } -+ spin_lock_irq(&sk->sk_receive_queue.lock); -+ skb = skb->next; -+ spin_unlock_irq(&sk->sk_receive_queue.lock); -+ } -+ -+ return 0; -+} -+ -+int cpt_index_sockets(cpt_context_t * ctx) -+{ -+ cpt_object_t *obj; -+ unsigned long index = 0; -+ -+ /* Collect not-yet-accepted children of listening sockets. */ -+ for_each_object(obj, CPT_OBJ_SOCKET) { -+ struct sock *sk = obj->o_obj; -+ -+ if (sk->sk_state != TCP_LISTEN) -+ continue; -+ -+ if (sk->sk_family == AF_UNIX) -+ collect_one_unix_listening_sock(obj, ctx); -+ } -+ -+ /* Assign indices to all the sockets. 
*/ -+ for_each_object(obj, CPT_OBJ_SOCKET) { -+ struct sock *sk = obj->o_obj; -+ cpt_obj_setindex(obj, index++, ctx); -+ -+ if (sk->sk_socket && sk->sk_socket->file) { -+ cpt_object_t *tobj; -+ tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx); -+ if (tobj) -+ cpt_obj_setindex(tobj, obj->o_index, ctx); -+ } -+ } -+ -+ return 0; -+} -+ -+void cpt_unlock_sockets(cpt_context_t * ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_SOCKET) { -+ struct sock *sk = obj->o_obj; -+ if (sk && obj->o_lock) { -+ if (sk->sk_socket) -+ release_sock(sk); -+ } -+ } -+} -+ -+void cpt_kill_sockets(cpt_context_t * ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_SOCKET) { -+ struct sock *sk = obj->o_obj; -+ if (sk && obj->o_lock) { -+ cpt_kill_socket(sk, ctx); -+ if (sk->sk_socket) -+ release_sock_nobacklog(sk); -+ } -+ } -+} -+ -+__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx) -+{ -+ struct fasync_struct *fa; -+ struct inode *inode = file->f_dentry->d_inode; -+ struct socket *sock; -+ -+ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; -+ -+ for (fa = sock->fasync_list; fa; fa = fa->fa_next) { -+ if (fa->fa_file == file) -+ return fa->fa_fd; -+ } -+ return -1; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket.h linux-2.6.16-026test009/kernel/cpt/cpt_socket.h ---- linux-2.6.16.orig/kernel/cpt/cpt_socket.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_socket.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,26 @@ -+struct sock; -+ -+int cpt_collect_passedfds(cpt_context_t *); -+int cpt_index_sockets(cpt_context_t *); -+int cpt_collect_socket(struct file *, cpt_context_t *); -+int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx); -+int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx); -+int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx); -+int 
rst_sockets(struct cpt_context *ctx); -+int rst_sockets_complete(struct cpt_context *ctx); -+int cpt_dump_orphaned_sockets(struct cpt_context *ctx); -+ -+struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx); -+ -+void cpt_unlock_sockets(cpt_context_t *); -+void cpt_kill_sockets(cpt_context_t *); -+ -+ -+int cpt_kill_socket(struct sock *, cpt_context_t *); -+int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*); -+int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx); -+__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx); -+int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *); -+int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx); -+int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx); -+int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx); -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket_in.c linux-2.6.16-026test009/kernel/cpt/cpt_socket_in.c ---- linux-2.6.16.orig/kernel/cpt/cpt_socket_in.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_socket_in.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,372 @@ -+/* -+ * -+ * kernel/cpt/cpt_socket_in.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/fs.h> -+#include <linux/socket.h> -+#include <linux/tcp.h> -+#include <net/sock.h> -+#include <net/tcp.h> -+#include <linux/ipv6.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_socket.h" -+#include "cpt_kernel.h" -+ -+static inline __u32 jiffies_export(unsigned long tmo) -+{ -+ __s32 delta = (long)(tmo - jiffies); -+ return delta; -+} -+ -+static inline __u32 tcp_jiffies_export(__u32 tmo) -+{ -+ __s32 delta = tmo - tcp_time_stamp; -+ return delta; -+} -+ -+int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx) -+{ -+ struct sk_buff *skb; -+ struct tcp_sock *tp; -+ -+ if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP) -+ return 0; -+ -+ tp = tcp_sk(sk); -+ -+ skb = skb_peek(&tp->out_of_order_queue); -+ while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) { -+ int err; -+ -+ err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx); -+ if (err) -+ return err; -+ -+ spin_lock_irq(&tp->out_of_order_queue.lock); -+ skb = skb->next; -+ spin_unlock_irq(&tp->out_of_order_queue.lock); -+ } -+ return 0; -+} -+ -+static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk, -+ struct cpt_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ si->cpt_pred_flags = tp->pred_flags; -+ si->cpt_rcv_nxt = tp->rcv_nxt; -+ si->cpt_snd_nxt = tp->snd_nxt; -+ si->cpt_snd_una = tp->snd_una; -+ si->cpt_snd_sml = tp->snd_sml; -+ si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp); -+ si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime); -+ si->cpt_tcp_header_len = tp->tcp_header_len; -+ si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending; -+ si->cpt_quick = inet_csk(sk)->icsk_ack.quick; -+ 
si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong; -+ si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked; -+ si->cpt_ato = inet_csk(sk)->icsk_ack.ato; -+ si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout); -+ si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime); -+ si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size; -+ si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss; -+ si->cpt_snd_wl1 = tp->snd_wl1; -+ si->cpt_snd_wnd = tp->snd_wnd; -+ si->cpt_max_window = tp->max_window; -+ si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie; -+ si->cpt_mss_cache = tp->mss_cache; -+ si->cpt_mss_cache_std = tp->mss_cache; /* FIXMW was tp->mss_cache_std */ -+ si->cpt_mss_clamp = tp->rx_opt.mss_clamp; -+ si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len; -+ si->cpt_ext2_header_len = 0; -+ si->cpt_ca_state = inet_csk(sk)->icsk_ca_state; -+ si->cpt_retransmits = inet_csk(sk)->icsk_retransmits; -+ si->cpt_reordering = tp->reordering; -+ si->cpt_frto_counter = tp->frto_counter; -+ si->cpt_frto_highmark = tp->frto_highmark; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) -+ // // si->cpt_adv_cong = tp->adv_cong; -+#endif -+ si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept; -+ si->cpt_backoff = inet_csk(sk)->icsk_backoff; -+ si->cpt_srtt = tp->srtt; -+ si->cpt_mdev = tp->mdev; -+ si->cpt_mdev_max = tp->mdev_max; -+ si->cpt_rttvar = tp->rttvar; -+ si->cpt_rtt_seq = tp->rtt_seq; -+ si->cpt_rto = inet_csk(sk)->icsk_rto; -+ si->cpt_packets_out = tp->packets_out; -+ si->cpt_left_out = tp->left_out; -+ si->cpt_retrans_out = tp->retrans_out; -+ si->cpt_lost_out = tp->lost_out; -+ si->cpt_sacked_out = tp->sacked_out; -+ si->cpt_fackets_out = tp->fackets_out; -+ si->cpt_snd_ssthresh = tp->snd_ssthresh; -+ si->cpt_snd_cwnd = tp->snd_cwnd; -+ si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt; -+ si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp; -+ si->cpt_snd_cwnd_used = tp->snd_cwnd_used; -+ si->cpt_snd_cwnd_stamp = 
tcp_jiffies_export(tp->snd_cwnd_stamp); -+ si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout); -+ si->cpt_ka_timeout = 0; -+ si->cpt_rcv_wnd = tp->rcv_wnd; -+ si->cpt_rcv_wup = tp->rcv_wup; -+ si->cpt_write_seq = tp->write_seq; -+ si->cpt_pushed_seq = tp->pushed_seq; -+ si->cpt_copied_seq = tp->copied_seq; -+ si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok; -+ si->cpt_wscale_ok = tp->rx_opt.wscale_ok; -+ si->cpt_sack_ok = tp->rx_opt.sack_ok; -+ si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp; -+ si->cpt_snd_wscale = tp->rx_opt.snd_wscale; -+ si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale; -+ si->cpt_nonagle = tp->nonagle; -+ si->cpt_keepalive_probes = tp->keepalive_probes; -+ si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval; -+ si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr; -+ si->cpt_ts_recent = tp->rx_opt.ts_recent; -+ si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; -+ si->cpt_user_mss = tp->rx_opt.user_mss; -+ si->cpt_dsack = tp->rx_opt.dsack; -+ si->cpt_eff_sacks = tp->rx_opt.eff_sacks; -+ si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq; -+ si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq; -+ si->cpt_sack_array[2] = tp->selective_acks[0].start_seq; -+ si->cpt_sack_array[3] = tp->selective_acks[0].end_seq; -+ si->cpt_sack_array[4] = tp->selective_acks[1].start_seq; -+ si->cpt_sack_array[5] = tp->selective_acks[1].end_seq; -+ si->cpt_sack_array[6] = tp->selective_acks[2].start_seq; -+ si->cpt_sack_array[7] = tp->selective_acks[2].end_seq; -+ si->cpt_sack_array[8] = tp->selective_acks[3].start_seq; -+ si->cpt_sack_array[9] = tp->selective_acks[3].end_seq; -+ si->cpt_window_clamp = tp->window_clamp; -+ si->cpt_rcv_ssthresh = tp->rcv_ssthresh; -+ si->cpt_probes_out = inet_csk(sk)->icsk_probes_out; -+ si->cpt_num_sacks = tp->rx_opt.num_sacks; -+ si->cpt_advmss = tp->advmss; -+ si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries; -+ si->cpt_ecn_flags = tp->ecn_flags; -+ si->cpt_prior_ssthresh = tp->prior_ssthresh; -+ si->cpt_high_seq = tp->high_seq; -+ 
si->cpt_retrans_stamp = tp->retrans_stamp; -+ si->cpt_undo_marker = tp->undo_marker; -+ si->cpt_undo_retrans = tp->undo_retrans; -+ si->cpt_urg_seq = tp->urg_seq; -+ si->cpt_urg_data = tp->urg_data; -+ si->cpt_pending = inet_csk(sk)->icsk_pending; -+ si->cpt_urg_mode = tp->urg_mode; -+ si->cpt_snd_up = tp->snd_up; -+ si->cpt_keepalive_time = tp->keepalive_time; -+ si->cpt_keepalive_intvl = tp->keepalive_intvl; -+ si->cpt_linger2 = tp->linger2; -+ -+ if (sk->sk_state != TCP_LISTEN && -+ sk->sk_state != TCP_CLOSE && -+ sock_flag(sk, SOCK_KEEPOPEN)) { -+ si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires); -+ } -+ -+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -+ if (sk->sk_family == AF_INET6 && tp->af_specific == &ipv6_mapped) -+ si->cpt_mapped = 1; -+#endif -+ -+ return 0; -+} -+ -+ -+int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk, -+ struct cpt_context *ctx) -+{ -+ struct inet_sock *inet = inet_sk(sk); -+ struct ipv6_pinfo *np = inet6_sk(sk); -+ -+ if (sk->sk_family == AF_INET) { -+ struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr); -+ sin->sin_family = AF_INET; -+ sin->sin_port = inet->sport; -+ sin->sin_addr.s_addr = inet->rcv_saddr; -+ si->cpt_laddrlen = sizeof(*sin); -+ } else if (sk->sk_family == AF_INET6) { -+ struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr); -+ sin6->sin6_family = AF_INET6; -+ sin6->sin6_port = inet->sport; -+ memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16); -+ si->cpt_laddrlen = sizeof(*sin6); -+ } -+ if (!inet->num) -+ si->cpt_laddrlen = 0; -+ -+ si->cpt_daddr = inet->daddr; -+ si->cpt_dport = inet->dport; -+ si->cpt_saddr = inet->saddr; -+ si->cpt_rcv_saddr = inet->rcv_saddr; -+ si->cpt_sport = inet->sport; -+ si->cpt_uc_ttl = inet->uc_ttl; -+ si->cpt_tos = inet->tos; -+ si->cpt_cmsg_flags = inet->cmsg_flags; -+ si->cpt_mc_index = inet->mc_index; -+ si->cpt_mc_addr = inet->mc_addr; -+ si->cpt_hdrincl = inet->hdrincl; -+ si->cpt_mc_ttl = inet->mc_ttl; -+ si->cpt_mc_loop = 
inet->mc_loop; -+ si->cpt_pmtudisc = inet->pmtudisc; -+ si->cpt_recverr = inet->recverr; -+ si->cpt_freebind = inet->freebind; -+ si->cpt_idcounter = inet->id; -+ -+ si->cpt_cork_flags = inet->cork.flags; -+ si->cpt_cork_fragsize = 0; -+ si->cpt_cork_length = inet->cork.length; -+ si->cpt_cork_addr = inet->cork.addr; -+ si->cpt_cork_saddr = inet->cork.fl.fl4_src; -+ si->cpt_cork_daddr = inet->cork.fl.fl4_dst; -+ si->cpt_cork_oif = inet->cork.fl.oif; -+ if (inet->cork.rt) { -+ si->cpt_cork_fragsize = inet->cork.fragsize; -+ si->cpt_cork_saddr = inet->cork.rt->fl.fl4_src; -+ si->cpt_cork_daddr = inet->cork.rt->fl.fl4_dst; -+ si->cpt_cork_oif = inet->cork.rt->fl.oif; -+ } -+ -+ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { -+ struct udp_sock *up = udp_sk(sk); -+ si->cpt_udp_pending = up->pending; -+ si->cpt_udp_corkflag = up->corkflag; -+ si->cpt_udp_encap = up->encap_type; -+ si->cpt_udp_len = up->len; -+ } -+ -+ if (sk->sk_family == AF_INET6) { -+ memcpy(si->cpt_saddr6, &np->saddr, 16); -+ memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16); -+ memcpy(si->cpt_daddr6, &np->daddr, 16); -+ si->cpt_flow_label6 = np->flow_label; -+ si->cpt_frag_size6 = np->frag_size; -+ si->cpt_hop_limit6 = np->hop_limit; -+ si->cpt_mcast_hops6 = np->mcast_hops; -+ si->cpt_mcast_oif6 = np->mcast_oif; -+ si->cpt_rxopt6 = np->rxopt.all; -+ si->cpt_mc_loop6 = np->mc_loop; -+ si->cpt_recverr6 = np->recverr; -+ si->cpt_sndflow6 = np->sndflow; -+ si->cpt_pmtudisc6 = np->pmtudisc; -+ si->cpt_ipv6only6 = np->ipv6only; -+ si->cpt_mapped = 0; -+ } -+ -+ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) -+ cpt_dump_socket_tcp(si, sk, ctx); -+ -+ return 0; -+} -+ -+int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx) -+{ -+ struct request_sock *req; -+ -+ for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next) -+ cpt_dump_socket(NULL, req->sk, -1, index, ctx); -+ return 0; -+} -+ -+ -+static int dump_openreq(struct 
request_sock *req, struct sock *sk, int index, -+ struct cpt_context *ctx) -+{ -+ struct cpt_openreq_image *v = cpt_get_buf(ctx); -+ -+ cpt_open_object(NULL, ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_OPENREQ; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_VOID; -+ -+ v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn; -+ v->cpt_snt_isn = tcp_rsk(req)->snt_isn; -+ v->cpt_rmt_port = inet_rsk(req)->rmt_port; -+ v->cpt_mss = req->mss; -+ // // v->cpt_family = (req->class == &or_ipv4 ? AF_INET : AF_INET6); -+ v->cpt_retrans = req->retrans; -+ v->cpt_snd_wscale = inet_rsk(req)->snd_wscale; -+ v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale; -+ v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok; -+ v->cpt_sack_ok = inet_rsk(req)->sack_ok; -+ v->cpt_wscale_ok = inet_rsk(req)->wscale_ok; -+ v->cpt_ecn_ok = inet_rsk(req)->ecn_ok; -+ v->cpt_acked = inet_rsk(req)->acked; -+ v->cpt_window_clamp = req->window_clamp; -+ v->cpt_rcv_wnd = req->rcv_wnd; -+ v->cpt_ts_recent = req->ts_recent; -+ v->cpt_expires = jiffies_export(req->expires); -+ -+ if (v->cpt_family == AF_INET) { -+ memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4); -+ memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4); -+ } else { -+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -+ memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16); -+ memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16); -+ v->cpt_iif = inet6_rsk(req)->iif; -+#endif -+ } -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ cpt_close_object(ctx); -+ return 0; -+} -+ -+int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx) -+{ -+ struct listen_sock *lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; -+ struct request_sock *req; -+ int i; -+ -+ for (i=0; i<TCP_SYNQ_HSIZE; i++) { -+ for (req=lopt->syn_table[i]; req; req=req->dl_next) { -+ loff_t saved_obj; -+ cpt_push_object(&saved_obj, ctx); -+ dump_openreq(req, sk, index, ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ } -+ } -+ 
return 0; -+} -+ -+ -+int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx) -+{ -+ if (sk->sk_state != TCP_CLOSE && -+ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && -+ sk->sk_protocol == IPPROTO_TCP) { -+ if (sk->sk_state != TCP_LISTEN) -+ tcp_set_state(sk, TCP_CLOSE); -+ else -+ sk->sk_prot->disconnect(sk, 0); -+ } -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_syscalls.h linux-2.6.16-026test009/kernel/cpt/cpt_syscalls.h ---- linux-2.6.16.orig/kernel/cpt/cpt_syscalls.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_syscalls.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,95 @@ -+#include <linux/unistd.h> -+#include <linux/syscalls.h> -+#include <asm/uaccess.h> -+ -+#define WRAP(c, args) return sys_##c args -+#define WRAP2(c, args) int err; mm_segment_t oldfs; \ -+ oldfs = get_fs(); set_fs(KERNEL_DS); \ -+ err = sys_##c args ;\ -+ set_fs(oldfs); \ -+ return err -+ -+static inline int sc_close(int fd) -+{ -+ WRAP(close, (fd)); -+} -+ -+static inline int sc_dup2(int fd1, int fd2) -+{ -+ WRAP(dup2, (fd1, fd2)); -+} -+ -+static inline int sc_unlink(char *name) -+{ -+ WRAP2(unlink, (name)); -+} -+ -+static inline int sc_pipe(int *pfd) -+{ -+ return do_pipe(pfd); -+} -+ -+static inline int sc_mknod(char *name, int mode, int dev) -+{ -+ WRAP2(mknod, (name, mode, dev)); -+} -+ -+static inline int sc_chmod(char *name, int mode) -+{ -+ WRAP2(mkdir, (name, mode)); -+} -+ -+static inline int sc_chown(char *name, int uid, int gid) -+{ -+ WRAP2(chown, (name, uid, gid)); -+} -+ -+static inline int sc_mkdir(char *name, int mode) -+{ -+ WRAP2(mkdir, (name, mode)); -+} -+ -+static inline int sc_rmdir(char *name) -+{ -+ WRAP2(rmdir, (name)); -+} -+ -+static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags) -+{ -+ WRAP2(mount, (mntdev ? 
: "none", mntpnt, type, flags, NULL)); -+} -+ -+static inline int sc_mprotect(unsigned long start, size_t len, -+ unsigned long prot) -+{ -+ WRAP(mprotect, (start, len, prot)); -+} -+ -+static inline int sc_mlock(unsigned long start, size_t len) -+{ -+ WRAP(mlock, (start, len)); -+} -+ -+static inline int sc_munlock(unsigned long start, size_t len) -+{ -+ WRAP(munlock, (start, len)); -+} -+ -+static inline int sc_remap_file_pages(unsigned long start, size_t len, -+ unsigned long prot, unsigned long pgoff, -+ unsigned long flags) -+{ -+ WRAP(remap_file_pages, (start, len, prot, pgoff, flags)); -+} -+ -+static inline int sc_waitx(int pid, int opt) -+{ -+ WRAP(wait4, (pid, NULL, opt, NULL)); -+} -+ -+static inline int sc_flock(int fd, int flags) -+{ -+ WRAP(flock, (fd, flags)); -+} -+ -+extern int sc_execve(char *cms, char **argv, char **env); -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_sysvipc.c linux-2.6.16-026test009/kernel/cpt/cpt_sysvipc.c ---- linux-2.6.16.orig/kernel/cpt/cpt_sysvipc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_sysvipc.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,317 @@ -+/* -+ * -+ * kernel/cpt/cpt_sysvipc.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/major.h> -+#include <linux/pipe_fs_i.h> -+#include <linux/mman.h> -+#include <linux/shm.h> -+#include <linux/sem.h> -+#include <linux/msg.h> -+#include <asm/uaccess.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_kernel.h" -+ -+struct _warg { -+ struct file *file; -+ struct cpt_sysvshm_image *v; -+}; -+ -+static int dump_one_shm(struct shmid_kernel *shp, void *arg) -+{ -+ struct _warg *warg = arg; -+ struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v; -+ -+ if (shp->shm_file != warg->file) -+ return 0; -+ -+ v->cpt_key = shp->shm_perm.key; -+ v->cpt_uid = shp->shm_perm.uid; -+ v->cpt_gid = shp->shm_perm.gid; -+ v->cpt_cuid = shp->shm_perm.cuid; -+ v->cpt_cgid = shp->shm_perm.cgid; -+ v->cpt_mode = shp->shm_perm.mode; -+ v->cpt_seq = shp->shm_perm.seq; -+ -+ v->cpt_id = shp->id; -+ v->cpt_segsz = shp->shm_segsz; -+ v->cpt_atime = shp->shm_atim; -+ v->cpt_ctime = shp->shm_ctim; -+ v->cpt_dtime = shp->shm_dtim; -+ v->cpt_creator = shp->shm_cprid; -+ v->cpt_last = shp->shm_lprid; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) -+ v->cpt_mlockuser = shp->mlock_user ? 
shp->mlock_user->uid : -1; -+#else -+ v->cpt_mlockuser = -1; -+#endif -+ return 1; -+} -+ -+int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx) -+{ -+ struct cpt_sysvshm_image *v = cpt_get_buf(ctx); -+ struct _warg warg; -+ -+ v->cpt_next = sizeof(*v); -+ v->cpt_object = CPT_OBJ_SYSV_SHM; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_VOID; -+ -+ warg.file = file; -+ warg.v = v; -+ if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) { -+ cpt_release_buf(ctx); -+ return -ESRCH; -+ } -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ return 0; -+} -+ -+ -+int match_sem(int id, struct sem_array *sema, void *arg) -+{ -+ if (id != (unsigned long)arg) -+ return 0; -+ return sema->sem_nsems + 1; -+} -+ -+static int get_sem_nsem(int id, cpt_context_t *ctx) -+{ -+ int res; -+ res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id); -+ if (res > 0) -+ return res - 1; -+ eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id); -+ return -ESRCH; -+} -+ -+static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx) -+{ -+ struct cpt_sysvsem_undo_image v; -+ loff_t saved_obj; -+ -+ cpt_open_object(NULL, ctx); -+ -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_SEMUNDO; -+ v.cpt_id = su->semid; -+ v.cpt_nsem = get_sem_nsem(su->semid, ctx); -+ if ((int)v.cpt_nsem < 0) -+ return -ESRCH; -+ -+ ctx->write(&v, sizeof(v), ctx); -+ -+ cpt_push_object(&saved_obj, ctx); -+ ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx); -+ cpt_pop_object(&saved_obj, ctx); -+ -+ cpt_close_object(ctx); -+ return 0; -+} -+ -+struct sem_warg { -+ int last_id; -+ struct cpt_sysvsem_image *v; -+}; -+ -+static int dump_one_sem(int id, struct sem_array *sma, void *arg) -+{ -+ struct sem_warg * warg = (struct sem_warg *)arg; -+ struct cpt_sysvsem_image *v = warg->v; -+ int i; -+ -+ if (warg->last_id != -1) { -+ if ((id % IPCMNI) <= warg->last_id) -+ return 
0; -+ } -+ -+ v->cpt_next = sizeof(*v); -+ v->cpt_object = CPT_OBJ_SYSV_SEM; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_SEMARRAY; -+ -+ v->cpt_key = sma->sem_perm.key; -+ v->cpt_uid = sma->sem_perm.uid; -+ v->cpt_gid = sma->sem_perm.gid; -+ v->cpt_cuid = sma->sem_perm.cuid; -+ v->cpt_cgid = sma->sem_perm.cgid; -+ v->cpt_mode = sma->sem_perm.mode; -+ v->cpt_seq = sma->sem_perm.seq; -+ -+ v->cpt_id = id; -+ v->cpt_ctime = sma->sem_ctime; -+ v->cpt_otime = sma->sem_otime; -+ -+ for (i=0; i<sma->sem_nsems; i++) { -+ struct { -+ __u32 semval; -+ __u32 sempid; -+ } *s = (void*)v + v->cpt_next; -+ if (v->cpt_next >= PAGE_SIZE - sizeof(*s)) -+ return -EINVAL; -+ s->semval = sma->sem_base[i].semval; -+ s->sempid = sma->sem_base[i].sempid; -+ v->cpt_next += sizeof(*s); -+ } -+ -+ warg->last_id = id % IPCMNI; -+ return 1; -+} -+ -+ -+int cpt_dump_sysvsem(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ struct sem_warg warg; -+ -+ /* Dumping semaphores is quite tricky because we cannot -+ * write to dump file under lock inside sysvipc_walk_sem(). 
-+ */ -+ cpt_open_section(ctx, CPT_SECT_SYSV_SEM); -+ warg.last_id = -1; -+ warg.v = cpt_get_buf(ctx); -+ for (;;) { -+ if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0) -+ break; -+ ctx->write(warg.v, warg.v->cpt_next, ctx); -+ } -+ cpt_release_buf(ctx); -+ cpt_close_section(ctx); -+ -+ cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO); -+ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { -+ struct sem_undo_list *semu = obj->o_obj; -+ struct sem_undo *su; -+ struct cpt_object_hdr v; -+ loff_t saved_obj; -+ -+ cpt_open_object(obj, ctx); -+ -+ v.cpt_next = CPT_NULL; -+ v.cpt_object = CPT_OBJ_SYSVSEM_UNDO; -+ v.cpt_hdrlen = sizeof(v); -+ v.cpt_content = CPT_CONTENT_ARRAY; -+ -+ ctx->write(&v, sizeof(v), ctx); -+ -+ cpt_push_object(&saved_obj, ctx); -+ for (su = semu->proc_list; su; su = su->proc_next) { -+ if (su->semid != -1) { -+ int err; -+ err = dump_one_semundo(su, ctx); -+ if (err < 0) -+ return err; -+ } -+ } -+ cpt_pop_object(&saved_obj, ctx); -+ -+ cpt_close_object(ctx); -+ } -+ cpt_close_section(ctx); -+ return 0; -+} -+ -+static int collect_one_msg(int id, struct msg_queue *msq, void *arg) -+{ -+ int *retp = arg; -+ (*retp)++; -+ return 0; -+} -+ -+int cpt_collect_sysvmsg(cpt_context_t * ctx) -+{ -+ int ret = 0; -+ sysvipc_walk_msg(collect_one_msg, &ret); -+ if (ret) { -+ eprintk_ctx("SYSV msgqueues are not supported, found %d\n", ret); -+ return -EBUSY; -+ } -+ return 0; -+} -+ -+static int cpt_collect_sysvsem_undo(cpt_context_t *ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ if (tsk->exit_state) { -+ /* ipc/sem.c forgets to clear tsk->sysvsem.undo_list -+ * on exit. Grrr... 
*/ -+ continue; -+ } -+ if (tsk->sysvsem.undo_list && -+ cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL) -+ return -ENOMEM; -+ } -+ -+ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { -+ struct sem_undo_list *semu = obj->o_obj; -+ -+ if (atomic_read(&semu->refcnt) != obj->o_count) { -+ eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt)); -+ return -EBUSY; -+ } -+ } -+ return 0; -+} -+ -+static int collect_one_shm(struct shmid_kernel *shp, void *arg) -+{ -+ cpt_context_t *ctx = arg; -+ -+ if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL) -+ return -ENOMEM; -+ return 0; -+} -+ -+int cpt_collect_sysvshm(cpt_context_t * ctx) -+{ -+ int err; -+ -+ err = sysvipc_walk_shm(collect_one_shm, ctx); -+ -+ return err < 0 ? err : 0; -+} -+ -+int cpt_collect_sysv(cpt_context_t * ctx) -+{ -+ int err; -+ -+ err = cpt_collect_sysvsem_undo(ctx); -+ if (err) -+ return err; -+ err = cpt_collect_sysvmsg(ctx); -+ if (err) -+ return err; -+ err = cpt_collect_sysvshm(ctx); -+ if (err) -+ return err; -+ -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_tty.c linux-2.6.16-026test009/kernel/cpt/cpt_tty.c ---- linux-2.6.16.orig/kernel/cpt/cpt_tty.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_tty.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,216 @@ -+/* -+ * -+ * kernel/cpt/cpt_tty.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/major.h> -+#include <linux/tty.h> -+#include <asm/uaccess.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+/* We must support at least N_TTY. */ -+ -+int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx) -+{ -+ struct tty_struct *tty = file->private_data; -+ cpt_object_t *obj; -+ struct cpt_obj_ref o; -+ loff_t saved_pos; -+ -+ obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx); -+ if (!obj) -+ return -EINVAL; -+ -+ cpt_push_object(&saved_pos, ctx); -+ -+ o.cpt_next = sizeof(o); -+ o.cpt_object = CPT_OBJ_REF; -+ o.cpt_hdrlen = sizeof(o); -+ o.cpt_content = CPT_CONTENT_VOID; -+ o.cpt_pos = obj->o_pos; -+ ctx->write(&o, sizeof(o), ctx); -+ -+ cpt_pop_object(&saved_pos, ctx); -+ -+ return 0; -+} -+ -+int cpt_collect_tty(struct file *file, cpt_context_t * ctx) -+{ -+ struct tty_struct *tty = file->private_data; -+ -+ if (tty) { -+ if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL) -+ return -ENOMEM; -+ if (tty->link) { -+ cpt_object_t *obj; -+ -+ obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx); -+ if (obj == NULL) -+ return -ENOMEM; -+ /* Undo o_count, tty->link is not a reference */ -+ obj->o_count--; -+ } -+ } -+ return 0; -+} -+ -+int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct tty_struct *tty = obj->o_obj; -+ struct cpt_tty_image *v; -+ -+ if (tty->link) { -+ if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) { -+ eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE); -+ return -EINVAL; -+ } -+ if (tty->link->link != tty) { -+ eprintk_ctx("bad pty pair\n"); -+ return -EINVAL; -+ } -+ if (tty->driver->type == TTY_DRIVER_TYPE_PTY && -+ 
tty->driver->subtype == PTY_TYPE_SLAVE && -+ tty->link->count) -+ obj->o_count++; -+ } -+ if (obj->o_count != tty->count) { -+ eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count); -+ return -EBUSY; -+ } -+ -+ cpt_open_object(obj, ctx); -+ -+ v = cpt_get_buf(ctx); -+ v->cpt_next = -1; -+ v->cpt_object = CPT_OBJ_TTY; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_ARRAY; -+ -+ v->cpt_index = tty->index; -+ v->cpt_link = -1; -+ if (tty->link) -+ v->cpt_link = tty->link->index; -+ v->cpt_drv_type = tty->driver->type; -+ v->cpt_drv_subtype = tty->driver->subtype; -+ v->cpt_drv_flags = tty->driver->flags; -+ v->cpt_packet = tty->packet; -+ v->cpt_stopped = tty->stopped; -+ v->cpt_hw_stopped = tty->hw_stopped; -+ v->cpt_flow_stopped = tty->flow_stopped; -+ v->cpt_flags = tty->flags; -+ v->cpt_ctrl_status = tty->ctrl_status; -+ v->cpt_canon_data = tty->canon_data; -+ v->cpt_canon_head = tty->canon_head - tty->read_tail; -+ v->cpt_canon_column = tty->canon_column; -+ v->cpt_column = tty->column; -+ v->cpt_erasing = tty->erasing; -+ v->cpt_lnext = tty->lnext; -+ v->cpt_icanon = tty->icanon; -+ v->cpt_raw = tty->raw; -+ v->cpt_real_raw = tty->real_raw; -+ v->cpt_closing = tty->closing; -+ v->cpt_minimum_to_wake = tty->minimum_to_wake; -+ v->cpt_pgrp = 0; -+ if (tty->pgrp > 0) { -+ v->cpt_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, tty->pgrp); -+ if ((int)v->cpt_pgrp < 0) { -+ dprintk_ctx("cannot map tty->pgrp %d -> %d\n", tty->pgrp, (int)v->cpt_pgrp); -+ v->cpt_pgrp = -1; -+ } -+ } -+ v->cpt_session = 0; -+ if (tty->session > 0) { -+ v->cpt_session = _pid_type_to_vpid(PIDTYPE_SID, tty->session); -+ if ((int)v->cpt_session < 0) { -+ eprintk_ctx("cannot map tty->session %d -> %d\n", tty->session, (int)v->cpt_session); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ } -+ memcpy(v->cpt_name, tty->name, 64); -+ v->cpt_ws_row = tty->winsize.ws_row; -+ v->cpt_ws_col = tty->winsize.ws_col; -+ v->cpt_ws_prow = tty->winsize.ws_ypixel; 
-+ v->cpt_ws_pcol = tty->winsize.ws_xpixel; -+ if (tty->termios == NULL) { -+ eprintk_ctx("NULL termios"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ v->cpt_c_line = tty->termios->c_line; -+ v->cpt_c_iflag = tty->termios->c_iflag; -+ v->cpt_c_oflag = tty->termios->c_oflag; -+ v->cpt_c_cflag = tty->termios->c_cflag; -+ v->cpt_c_lflag = tty->termios->c_lflag; -+ memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS); -+ if (NCCS < 32) -+ memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS); -+ memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags)); -+ -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ if (tty->read_buf && tty->read_cnt) { -+ struct cpt_obj_bits *v = cpt_get_buf(ctx); -+ loff_t saved_pos; -+ -+ cpt_push_object(&saved_pos, ctx); -+ cpt_open_object(NULL, ctx); -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_BITS; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_DATA; -+ v->cpt_size = tty->read_cnt; -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_release_buf(ctx); -+ -+ if (tty->read_cnt) { -+ int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail); -+ ctx->write(tty->read_buf + tty->read_tail, n, ctx); -+ if (tty->read_cnt > n) -+ ctx->write(tty->read_buf, tty->read_cnt-n, ctx); -+ ctx->align(ctx); -+ } -+ -+ cpt_close_object(ctx); -+ cpt_pop_object(&saved_pos, ctx); -+ } -+ -+ cpt_close_object(ctx); -+ -+ return 0; -+} -+ -+__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx) -+{ -+ struct tty_struct * tty; -+ struct fasync_struct *fa; -+ -+ tty = (struct tty_struct *)file->private_data; -+ -+ for (fa = tty->fasync; fa; fa = fa->fa_next) { -+ if (fa->fa_file == file) -+ return fa->fa_fd; -+ } -+ return -1; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_ubc.c linux-2.6.16-026test009/kernel/cpt/cpt_ubc.c ---- linux-2.6.16.orig/kernel/cpt/cpt_ubc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_ubc.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,132 @@ -+/* -+ * -+ * 
kernel/cpt/cpt_ubc.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/types.h> -+#include <ub/beancounter.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx); -+ if (obj != NULL) { -+ if (obj->o_count == 1) -+ get_beancounter(bc); -+ if (bc->parent != NULL && obj->o_parent == NULL) -+ obj->o_parent = cpt_add_ubc(bc->parent, ctx); -+ } -+ return obj; -+} -+ -+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx); -+ if (obj == NULL) { -+ char buf[48]; -+ print_ub_uid(bc, buf, sizeof(buf)); -+ printk(KERN_ERR "CPT: unknown ub %s (%p)\n", buf, bc); -+ dump_stack(); -+ return CPT_NULL; -+ } -+ return obj->o_pos; -+} -+ -+static void dump_one_bc_parm(__u64 *dmp, struct ubparm *prm, int held) -+{ -+ dmp[0] = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL); -+ dmp[1] = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL); -+ dmp[2] = (held ? prm->held : CPT_NULL); -+ dmp[3] = prm->maxheld; -+ dmp[4] = prm->minheld; -+ dmp[5] = prm->failcnt; -+} -+ -+static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct user_beancounter *bc; -+ struct cpt_beancounter_image *v; -+ int i; -+ -+ bc = obj->o_obj; -+ v = cpt_get_buf(ctx); -+ -+ v->cpt_next = CPT_NULL; -+ v->cpt_object = CPT_OBJ_UBC; -+ v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_VOID; -+ -+ if (obj->o_parent != NULL) -+ v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos; -+ else -+ v->cpt_parent = CPT_NULL; -+ v->cpt_id = (obj->o_parent != NULL) ? 
bc->ub_uid : 0; -+ for (i = 0; i < UB_RESOURCES; i++) -+ dump_one_bc_parm(v->cpt_parms, bc->ub_parms, 0); -+ for (i = 0; i < UB_RESOURCES; i++) -+ dump_one_bc_parm(v->cpt_parms + UB_RESOURCES * 6, -+ bc->ub_store, 1); -+ memset(v->cpt_parms + UB_RESOURCES * 12, 0, -+ sizeof(v->cpt_parms) -+ - UB_RESOURCES * 12 * sizeof(v->cpt_parms[0])); -+ -+ cpt_open_object(obj, ctx); -+ ctx->write(v, sizeof(*v), ctx); -+ cpt_close_object(ctx); -+ -+ cpt_release_buf(ctx); -+ return 0; -+} -+ -+int cpt_dump_ubc(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ int skipped; -+ int top; -+ -+ cpt_open_section(ctx, CPT_SECT_UBC); -+ -+ do { -+ skipped = 0; -+ top = 0; -+ for_each_object(obj, CPT_OBJ_UBC) { -+ if (obj->o_parent == NULL) -+ top++; -+ if (obj->o_pos != CPT_NULL) -+ continue; -+ if (obj->o_parent != NULL && -+ ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL) -+ skipped++; -+ else -+ dump_one_bc(obj, ctx); -+ } -+ } while (skipped && (top < 2)); -+ -+ cpt_close_section(ctx); -+ if (top > 1) { -+ eprintk_ctx("More than one top level ub exist"); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+void cpt_finish_ubc(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_UBC) -+ put_beancounter(obj->o_obj); -+} -diff -upr linux-2.6.16.orig/kernel/cpt/cpt_ubc.h linux-2.6.16-026test009/kernel/cpt/cpt_ubc.h ---- linux-2.6.16.orig/kernel/cpt/cpt_ubc.h 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/cpt_ubc.h 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,9 @@ -+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx); -+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx); -+int cpt_dump_ubc(struct cpt_context *ctx); -+ -+struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx); -+int rst_undump_ubc(struct cpt_context *ctx); -+ -+void cpt_finish_ubc(struct cpt_context *ctx); -+void rst_finish_ubc(struct cpt_context *ctx); -diff -upr 
linux-2.6.16.orig/kernel/cpt/cpt_x8664.S linux-2.6.16-026test009/kernel/cpt/cpt_x8664.S
---- linux-2.6.16.orig/kernel/cpt/cpt_x8664.S 2006-04-19 15:02:49.000000000 +0400
-+++ linux-2.6.16-026test009/kernel/cpt/cpt_x8664.S 2006-04-19 15:02:13.000000000 +0400
-@@ -0,0 +1,61 @@
#define ASSEMBLY 1
#include <linux/config.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/errno.h>

	.code64

	/* Build a six-quadword frame: ss, rsp, eflags, cs, rip, orig rax. */
	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	xorq %rax, %rax
	pushq %rax /* ss */
	pushq %rax /* rsp */
	pushq $(1<<9) /* eflags - interrupts on */
	pushq $__KERNEL_CS /* cs */
	pushq \child_rip /* rip */
	pushq %rax /* orig rax */
	.endm

	/* Drop the six quadwords pushed by FAKE_STACK_FRAME. */
	.macro UNFAKE_STACK_FRAME
	addq $8*6, %rsp
	.endm

/*
 * asm_kernel_thread: build a fake register frame whose rip is child_rip
 * and hand it to do_fork_pid().  The value returned in %rax is stored
 * into the frame's RAX slot before the registers are restored.
 * NOTE(review): the incoming argument layout (function in %rdi, its
 * argument in %rsi, flags in %rdx, a further value in %rcx) is inferred
 * from child_rip and the moves below - confirm against the C prototype.
 */
ENTRY(asm_kernel_thread)
	FAKE_STACK_FRAME $child_rip
	SAVE_ALL

	# rdi: flags, rsi: usp, rdx: will be &pt_regs
	movq %rdx,%rdi
	orq $0x00800000,%rdi	/* presumably CLONE_UNTRACED - confirm */
	movq $-1, %rsi
	movq %rsp, %rdx

	xorl %r8d,%r8d
	xorl %r9d,%r9d
	pushq %rcx		/* passed on the stack; popped by the addq below */
	call do_fork_pid
	addq $8, %rsp
	/* call do_fork */
	movq %rax,RAX(%rsp)
	xorl %edi,%edi
	RESTORE_ALL
	UNFAKE_STACK_FRAME
	ret

/*
 * child_rip: entry point of the new thread.  Calls the function held in
 * %rdi with %rsi as its first argument, then calls complete_and_exit()
 * with both arguments zero.
 */
child_rip:
	movq %rdi, %rax
	movq %rsi, %rdi
	call *%rax
	xorq %rdi, %rdi
	xorq %rsi, %rsi
	call complete_and_exit
-diff -upr linux-2.6.16.orig/kernel/cpt/rst_conntrack.c linux-2.6.16-026test009/kernel/cpt/rst_conntrack.c
---- linux-2.6.16.orig/kernel/cpt/rst_conntrack.c 2006-04-19 15:02:49.000000000 +0400
-+++ linux-2.6.16-026test009/kernel/cpt/rst_conntrack.c 2006-04-19 15:02:13.000000000 +0400
-@@ -0,0 +1,294 @@
/*
 *
 * kernel/cpt/rst_conntrack.c
 *
 * Copyright (C) 2000-2005 SWsoft
 * All rights reserved.
-+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/fs.h> -+#include <linux/socket.h> -+#include <linux/netdevice.h> -+#include <linux/inetdevice.h> -+#include <linux/rtnetlink.h> -+#include <linux/unistd.h> -+#include <linux/ve.h> -+#include <linux/vzcalluser.h> -+#include <linux/cpt_image.h> -+#include <linux/icmp.h> -+#include <linux/ip.h> -+ -+#if defined(CONFIG_VE_IPTABLES) && \ -+ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) -+ -+#include <linux/netfilter.h> -+#include <linux/netfilter_ipv4/ip_conntrack.h> -+#include <linux/netfilter_ipv4/ip_nat.h> -+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -+#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -+#include <linux/netfilter_ipv4/ip_conntrack_core.h> -+#include <linux/netfilter_ipv4/ip_nat_helper.h> -+#include <linux/netfilter_ipv4/ip_nat_core.h> -+ -+#define ASSERT_READ_LOCK(x) do { } while (0) -+#define ASSERT_WRITE_LOCK(x) do { } while (0) -+ -+#include <linux/netfilter_ipv4/listhelp.h> -+ -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+struct ct_holder -+{ -+ struct ct_holder *next; -+ struct ip_conntrack *ct; -+ int index; -+}; -+ -+static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir) -+{ -+ tuple->dst.ip = v->cpt_dst; -+ tuple->dst.u.all = v->cpt_dstport; -+ tuple->dst.protonum = v->cpt_protonum; -+ tuple->dst.dir = v->cpt_dir; -+ if (dir != tuple->dst.dir) -+ wprintk("dir != tuple->dst.dir\n"); -+ -+ tuple->src.ip = v->cpt_src; -+ tuple->src.u.all = v->cpt_srcport; -+} -+ -+ -+static int undump_expect_list(struct ip_conntrack *ct, -+ struct cpt_ip_conntrack_image *ci, -+ loff_t pos, struct ct_holder 
*ct_list, -+ cpt_context_t *ctx) -+{ -+ loff_t end; -+ int err; -+ -+ end = pos + ci->cpt_next; -+ pos += ci->cpt_hdrlen; -+ while (pos < end) { -+ struct cpt_ip_connexpect_image v; -+ struct ip_conntrack_expect *exp; -+ struct ip_conntrack *sibling; -+ -+ err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx); -+ if (err) -+ return err; -+ -+ sibling = NULL; -+ if (v.cpt_sibling_conntrack) { -+ struct ct_holder *c; -+ -+ for (c = ct_list; c; c = c->next) { -+ if (c->index == v.cpt_sibling_conntrack) { -+ sibling = c->ct; -+ break; -+ } -+ } -+ if (!sibling) { -+ eprintk_ctx("lost sibling of expectation\n"); -+ return -EINVAL; -+ } -+ } -+ -+ write_lock_bh(&ip_conntrack_lock); -+ -+ /* It is possible. Helper module could be just unregistered, -+ * if expectation were on the list, it would be destroyed. */ -+ if (ct->helper == NULL) { -+ write_unlock_bh(&ip_conntrack_lock); -+ dprintk_ctx("conntrack: no helper and non-trivial expectation\n"); -+ continue; -+ } -+ -+ exp = ip_conntrack_expect_alloc(NULL); -+ if (exp == NULL) { -+ write_unlock_bh(&ip_conntrack_lock); -+ return -ENOMEM; -+ } -+ -+ if (ct->helper->timeout && !del_timer(&exp->timeout)) { -+ /* Dying already. We can do nothing. 
*/ -+ write_unlock_bh(&ip_conntrack_lock); -+ dprintk_ctx("conntrack expectation is dying\n"); -+ continue; -+ } -+ -+ decode_tuple(&v.cpt_tuple, &exp->tuple, 0); -+ decode_tuple(&v.cpt_mask, &exp->mask, 0); -+ -+ exp->master = ct; -+ nf_conntrack_get(&ct->ct_general); -+ ip_conntrack_expect_insert(exp); -+#if 0 -+ if (sibling) { -+ exp->sibling = sibling; -+ sibling->master = exp; -+ LIST_DELETE(&ve_ip_conntrack_expect_list, exp); -+ ct->expecting--; -+ nf_conntrack_get(&master_ct(sibling)->infos[0]); -+ } else -+#endif -+ if (ct->helper->timeout) { -+ exp->timeout.expires = jiffies + v.cpt_timeout; -+ add_timer(&exp->timeout); -+ } -+ write_unlock_bh(&ip_conntrack_lock); -+ -+ pos += v.cpt_next; -+ } -+ return 0; -+} -+ -+static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos, -+ struct ct_holder **ct_list, cpt_context_t *ctx) -+{ -+ int err = 0; -+ struct ip_conntrack *conntrack; -+ struct ct_holder *c; -+ struct ip_conntrack_tuple orig, repl; -+ -+ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); -+ if (c == NULL) -+ return -ENOMEM; -+ -+ decode_tuple(&ci->cpt_tuple[0], &orig, 0); -+ decode_tuple(&ci->cpt_tuple[1], &repl, 1); -+ -+ conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub); -+ if (!conntrack) { -+ kfree(c); -+ return -ENOMEM; -+ } -+ -+ c->ct = conntrack; -+ c->next = *ct_list; -+ *ct_list = c; -+ c->index = ci->cpt_index; -+ -+ decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0); -+ decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1); -+ -+ conntrack->status = ci->cpt_status; -+ -+ memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto)); -+ memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help)); -+ -+#ifdef CONFIG_IP_NF_NAT_NEEDED -+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ -+ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) -+ conntrack->nat.masq_index = ci->cpt_masq_index; -+#endif -+ if (ci->cpt_initialized) { -+ 
conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos; -+ conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before; -+ conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after; -+ conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos; -+ conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before; -+ conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after; -+ } -+ if (conntrack->status & IPS_NAT_DONE_MASK) -+ ip_nat_hash_conntrack(conntrack); -+#endif -+ -+ write_lock_bh(&ip_conntrack_lock); -+ -+ if (ci->cpt_ct_helper) { -+ conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple); -+ if (conntrack->helper == NULL) { -+ eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n"); -+ err = -EINVAL; -+ } -+ } -+ -+ ip_conntrack_hash_insert(conntrack); -+ conntrack->timeout.expires = jiffies + ci->cpt_timeout; -+ -+ write_unlock_bh(&ip_conntrack_lock); -+ -+ if (err == 0 && ci->cpt_next > ci->cpt_hdrlen) -+ err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx); -+ -+ return err; -+} -+ -+int rst_restore_ip_conntrack(struct cpt_context * ctx) -+{ -+ int err = 0; -+ loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ struct cpt_ip_conntrack_image ci; -+ struct ct_holder *c; -+ struct ct_holder *ct_list = NULL; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) { -+ eprintk_ctx("conntrack module ct->proto version mismatch\n"); -+ return -EINVAL; -+ } -+ if (sizeof(ci.cpt_help_data) != sizeof(union ip_conntrack_help)) { -+ eprintk_ctx("conntrack module ct->help version mismatch\n"); -+ return -EINVAL; -+ } -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ 
endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx); -+ if (err) -+ break; -+ err = undump_one_ct(&ci, sec, &ct_list, ctx); -+ if (err) -+ break; -+ sec += ci.cpt_next; -+ } -+ -+ while ((c = ct_list) != NULL) { -+ ct_list = c->next; -+ if (c->ct) -+ add_timer(&c->ct->timeout); -+ kfree(c); -+ } -+ -+ return err; -+} -+ -+#else -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+int rst_restore_ip_conntrack(struct cpt_context * ctx) -+{ -+ if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL) -+ return -EINVAL; -+ return 0; -+} -+ -+#endif -diff -upr linux-2.6.16.orig/kernel/cpt/rst_context.c linux-2.6.16-026test009/kernel/cpt/rst_context.c ---- linux-2.6.16.orig/kernel/cpt/rst_context.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_context.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,315 @@ -+/* -+ * -+ * kernel/cpt/rst_context.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/pagemap.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx) -+{ -+ mm_segment_t oldfs; -+ ssize_t err = -EBADF; -+ struct file *file = ctx->file; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ if (file) -+ err = file->f_op->read(file, addr, count, &file->f_pos); -+ set_fs(oldfs); -+ if (err != count) -+ return err >= 0 ? 
-EIO : err; -+ return 0; -+} -+ -+static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) -+{ -+ mm_segment_t oldfs; -+ ssize_t err = -EBADF; -+ struct file *file = ctx->file; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ if (file) -+ err = file->f_op->read(file, addr, count, &pos); -+ set_fs(oldfs); -+ if (err != count) -+ return err >= 0 ? -EIO : err; -+ return 0; -+} -+ -+static void file_align(struct cpt_context *ctx) -+{ -+ struct file *file = ctx->file; -+ -+ if (file) -+ file->f_pos = CPT_ALIGN(file->f_pos); -+} -+ -+int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end) -+{ -+ struct cpt_section_hdr hdr; -+ int err; -+ loff_t pos; -+ -+ pos = ctx->sections[type]; -+ *start = *end = pos; -+ -+ if (pos != CPT_NULL) { -+ if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0) -+ return err; -+ if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr)) -+ return -EINVAL; -+ *start = pos + hdr.cpt_hdrlen; -+ *end = pos + hdr.cpt_next; -+ } -+ return 0; -+} -+EXPORT_SYMBOL(rst_get_section); -+ -+void rst_context_init(struct cpt_context *ctx) -+{ -+ int i; -+ -+ memset(ctx, 0, sizeof(*ctx)); -+ -+ init_MUTEX(&ctx->main_sem); -+ ctx->refcount = 1; -+ -+ ctx->current_section = -1; -+ ctx->current_object = -1; -+ ctx->pagesize = PAGE_SIZE; -+ ctx->read = file_read; -+ ctx->pread = file_pread; -+ ctx->align = file_align; -+ for (i=0; i < CPT_SECT_MAX; i++) -+ ctx->sections[i] = CPT_NULL; -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ init_completion(&ctx->pgin_notify); -+#endif -+ cpt_object_init(ctx); -+} -+ -+static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx) -+{ -+ struct cpt_section_hdr h; -+ -+ while (start < end) { -+ int err; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, start); -+ if (err) -+ return err; -+ if (h.cpt_hdrlen < sizeof(h) || -+ h.cpt_next < h.cpt_hdrlen || -+ start + h.cpt_next > end) -+ return -EINVAL; -+ if (h.cpt_section >= CPT_SECT_MAX) -+ return -EINVAL; -+ 
ctx->sections[h.cpt_section] = start; -+ start += h.cpt_next; -+ } -+ return 0; -+} -+ -+int rst_open_dumpfile(struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_major_tail *v; -+ struct cpt_major_hdr h; -+ unsigned long size; -+ -+ err = -EBADF; -+ if (!ctx->file) -+ goto err_out; -+ -+ err = -ENOMEM; -+ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); -+ if (ctx->tmpbuf == NULL) -+ goto err_out; -+ __cpt_release_buf(ctx); -+ -+ size = ctx->file->f_dentry->d_inode->i_size; -+ -+ if (size & 7) { -+ err = -EINVAL; -+ goto err_out; -+ } -+ if (size < sizeof(struct cpt_major_hdr) + -+ sizeof(struct cpt_major_tail)) { -+ err = -EINVAL; -+ goto err_out; -+ } -+ err = ctx->pread(&h, sizeof(h), ctx, 0); -+ if (err) { -+ eprintk_ctx("too short image 1 %d\n", err); -+ goto err_out; -+ } -+ if (h.cpt_signature[0] != CPT_SIGNATURE0 || -+ h.cpt_signature[1] != CPT_SIGNATURE1 || -+ h.cpt_signature[2] != CPT_SIGNATURE2 || -+ h.cpt_signature[3] != CPT_SIGNATURE3) { -+ err = -EINVAL; -+ goto err_out; -+ } -+ if (h.cpt_hz != HZ) { -+ err = -EINVAL; -+ eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ); -+ goto err_out; -+ } -+ ctx->virt_jiffies64 = h.cpt_start_jiffies64; -+ ctx->start_time.tv_sec = h.cpt_start_sec; -+ ctx->start_time.tv_nsec = h.cpt_start_nsec; -+ ctx->kernel_config_flags = h.cpt_kernel_config[0]; -+ ctx->iptables_mask = h.cpt_iptables_mask; -+ ctx->image_version = h.cpt_image_version; -+ -+ v = cpt_get_buf(ctx); -+ err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v)); -+ if (err) { -+ eprintk_ctx("too short image 2 %d\n", err); -+ cpt_release_buf(ctx); -+ goto err_out; -+ } -+ if (v->cpt_signature[0] != CPT_SIGNATURE0 || -+ v->cpt_signature[1] != CPT_SIGNATURE1 || -+ v->cpt_signature[2] != CPT_SIGNATURE2 || -+ v->cpt_signature[3] != CPT_SIGNATURE3 || -+ v->cpt_nsect != CPT_SECT_MAX_INDEX) { -+ err = -EINVAL; -+ cpt_release_buf(ctx); -+ goto err_out; -+ } -+ if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) 
< 0) { -+ cpt_release_buf(ctx); -+ goto err_out; -+ } -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ ctx->lazypages = v->cpt_lazypages; -+#endif -+ ctx->tasks64 = v->cpt_64bit; -+ cpt_release_buf(ctx); -+ return 0; -+ -+err_out: -+ if (ctx->tmpbuf) { -+ free_page((unsigned long)ctx->tmpbuf); -+ ctx->tmpbuf = NULL; -+ } -+ return err; -+} -+ /* Undo rst_open_dumpfile(): drop the reference on ctx->file and free the scratch page, clearing both pointers. Either may already be NULL. */ -+void rst_close_dumpfile(struct cpt_context *ctx) -+{ -+ if (ctx->file) { -+ fput(ctx->file); -+ ctx->file = NULL; -+ } -+ if (ctx->tmpbuf) { -+ free_page((unsigned long)ctx->tmpbuf); -+ ctx->tmpbuf = NULL; -+ } -+} -+ /* Read the object at image position pos into the caller buffer tmp, which holds size bytes. The generic object header is read first and validated: a positive type must equal cpt_object, and cpt_hdrlen must be at least sizeof(struct cpt_object_hdr) and not larger than cpt_next. In total at most min(size, cpt_hdrlen) bytes are read. Returns 0 or a negative errno (-EINVAL for a malformed header or a size below the generic header). NOTE(review): the size check runs only after the generic header has been read into tmp, so tmp must always have room for one - confirm against callers. */ -+int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_object_hdr *hdr = tmp; -+ err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos); -+ if (err) -+ return err; -+ if (type > 0 && type != hdr->cpt_object) -+ return -EINVAL; -+ if (hdr->cpt_hdrlen > hdr->cpt_next) -+ return -EINVAL; -+ if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr)) -+ return -EINVAL; -+ if (size < sizeof(*hdr)) -+ return -EINVAL; -+ if (size > hdr->cpt_hdrlen) -+ size = hdr->cpt_hdrlen; -+ if (size > sizeof(*hdr)) -+ err = ctx->pread(hdr+1, size - sizeof(*hdr), -+ ctx, pos + sizeof(*hdr)); -+ return err; -+} -+EXPORT_SYMBOL(_rst_get_object); -+ /* Allocating variant of _rst_get_object(): validates the generic header at pos the same way, then kmalloc()s cpt_hdrlen bytes and reads the complete header into them. Returns the buffer, which the caller releases with kfree(), or NULL on any failure (read error, bad header or failed allocation). */ -+void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx) -+{ -+ int err; -+ void *tmp; -+ struct cpt_object_hdr hdr; -+ err = ctx->pread(&hdr, sizeof(hdr), ctx, pos); -+ if (err) -+ return NULL; -+ if (type > 0 && type != hdr.cpt_object) -+ return NULL; -+ if (hdr.cpt_hdrlen > hdr.cpt_next) -+ return NULL; -+ if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr)) -+ return NULL; -+ tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL); -+ if (!tmp) -+ return NULL; -+ err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos); -+ if (!err) -+ return tmp; -+ kfree(tmp); -+ return NULL; -+} -+EXPORT_SYMBOL(__rst_get_object); -+ /* Read the CPT_OBJ_NAME object at *pos_p and return its payload, i.e. the bytes between cpt_hdrlen and cpt_next (at most PAGE_SIZE), in a freshly allocated page. On success *pos_p is advanced past the object. Returns NULL on any failure. */ -+__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_object_hdr hdr; -+ __u8 
*name; -+ -+ err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx); -+ if (err) -+ return NULL; -+ if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE) -+ return NULL; -+ name = (void*)__get_free_page(GFP_KERNEL); -+ if (!name) -+ return NULL; -+ err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen, -+ ctx, *pos_p + hdr.cpt_hdrlen); -+ if (err) { -+ free_page((unsigned long)name); -+ return NULL; -+ } -+ *pos_p += hdr.cpt_next; -+ return name; -+} -+ /* Wrapper around __rst_get_name() for callers that do not need the advanced position. */ -+__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx) -+{ -+ return __rst_get_name(&pos, ctx); -+} -+ /* Free a name page returned by rst_get_name(). The address is rounded down to its page boundary first; NULL is ignored. ctx is unused. */ -+void rst_put_name(__u8 *name, struct cpt_context *ctx) -+{ -+ unsigned long addr = (unsigned long)name; -+ -+ if (addr) -+ free_page(addr&~(PAGE_SIZE-1)); -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_epoll.c linux-2.6.16-026test009/kernel/cpt/rst_epoll.c ---- linux-2.6.16.orig/kernel/cpt/rst_epoll.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_epoll.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,173 @@ -+/* -+ * -+ * kernel/cpt/rst_epoll.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/major.h> -+#include <linux/pipe_fs_i.h> -+#include <linux/mman.h> -+#include <linux/namespace.h> -+#include <linux/mount.h> -+#include <linux/namei.h> -+#include <linux/smp_lock.h> -+#include <asm/uaccess.h> -+#include <linux/vzcalluser.h> -+#include <linux/eventpoll.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_files.h" -+#include "cpt_kernel.h" -+#include "cpt_fsmagic.h" -+#include "cpt_syscalls.h" -+ -+/* Those functions are static in fs/eventpoll.c */ -+extern struct file_operations eventpoll_fops; -+extern int ep_insert(struct eventpoll *ep, struct epoll_event *event, -+ struct file *tfile, int fd); -+extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); -+extern void ep_release_epitem(struct epitem *epi); -+ -+ /* Create a fresh epoll file: make an epoll descriptor, take a file reference with fget() and close the descriptor again so that only the struct file remains. Returns the file, ERR_PTR() if sys_epoll_create() fails, or NULL if fget() does. fi, flags and ctx are unused. */ -+struct file *cpt_open_epolldev(struct cpt_file_image *fi, -+ unsigned flags, -+ struct cpt_context *ctx) -+{ -+ struct file *file; -+ int efd; -+ -+ /* Argument "size" is ignored, use just 1 */ -+ efd = sys_epoll_create(1); -+ if (efd < 0) -+ return ERR_PTR(efd); -+ -+ file = fget(efd); -+ sys_close(efd); -+ return file; -+} -+ /* Re-populate one epoll instance. obj->o_obj must be a file using eventpoll_fops with non-NULL private data. Every CPT_OBJ_EPOLL_FILE record between pos + cpt_hdrlen and pos + cpt_next is inserted with ep_insert() under ep->sem; its revents are restored and, if the record is marked ready, the item is linked onto ep->rdllist. Returns 0 or a negative errno. */ -+static int restore_one_epoll(cpt_object_t *obj, -+ loff_t pos, -+ struct cpt_epoll_image *ebuf, -+ cpt_context_t *ctx) -+{ -+ int err = 0; -+ loff_t endpos; -+ struct file *file = obj->o_obj; -+ struct eventpoll *ep; -+ -+ if (file->f_op != &eventpoll_fops) { -+ eprintk_ctx("bad epoll file\n"); -+ return -EINVAL; -+ } -+ -+ ep = file->private_data; -+ -+ if (unlikely(ep == NULL)) { -+ eprintk_ctx("bad epoll device\n"); -+ return -EINVAL; -+ } -+ -+ endpos = pos + ebuf->cpt_next; -+ pos += ebuf->cpt_hdrlen; -+ while (pos < 
endpos) { -+ struct cpt_epoll_file_image efi; -+ struct epoll_event epds; -+ -+ cpt_object_t *tobj; -+ -+ err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx); -+ if (err) -+ return err; -+ tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx); -+ if (!tobj) { -+ eprintk_ctx("epoll file not found\n"); -+ return -EINVAL; -+ } -+ epds.events = efi.cpt_events; -+ epds.data = efi.cpt_data; -+ down_write(&ep->sem); -+ err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd); -+ if (!err) { -+ struct epitem *epi; -+ epi = ep_find(ep, tobj->o_obj, efi.cpt_fd); -+ if (epi) { -+ epi->revents = efi.cpt_revents; -+ if (efi.cpt_ready) { -+ unsigned long flags; -+ write_lock_irqsave(&ep->lock, flags); -+ if (list_empty(&epi->rdllink)) -+ list_add_tail(&epi->rdllink, &ep->rdllist); -+ write_unlock_irqrestore(&ep->lock, flags); -+ } -+ ep_release_epitem(epi); -+ } -+ } -+ up_write(&ep->sem); -+ if (err) -+ break; -+ pos += efi.cpt_next; -+ } -+ return err; -+} -+ -+int rst_eventpoll(cpt_context_t *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_EPOLL]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ cpt_object_t *obj; -+ struct cpt_epoll_image *ebuf = cpt_get_buf(ctx); -+ err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx); -+ if (err) { -+ cpt_release_buf(ctx); -+ return err; -+ } -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx); -+ if (obj == NULL) { -+ eprintk_ctx("cannot find epoll file object\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ err = restore_one_epoll(obj, sec, ebuf, ctx); -+ cpt_release_buf(ctx); -+ if (err) -+ return err; -+ sec += ebuf->cpt_next; -+ } -+ -+ return 0; -+ -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_files.c 
linux-2.6.16-026test009/kernel/cpt/rst_files.c ---- linux-2.6.16.orig/kernel/cpt/rst_files.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_files.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,1447 @@ -+/* -+ * -+ * kernel/cpt/rst_files.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/major.h> -+#include <linux/pipe_fs_i.h> -+#include <linux/mman.h> -+#include <linux/mount.h> -+#include <linux/tty.h> -+#include <linux/namei.h> -+#include <linux/vmalloc.h> -+#include <linux/smp_lock.h> -+#include <linux/vmalloc.h> -+#include <linux/pagemap.h> -+#include <asm/uaccess.h> -+#include <ub/ub_mem.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_files.h" -+#include "cpt_kernel.h" -+#include "cpt_fsmagic.h" -+ -+#include "cpt_syscalls.h" -+ -+ /* Deferred descriptor restore job, kept on the singly linked list ctx->filejob_queue: pid is the task that queued the job and fdi the image position of its descriptor record. */ -+struct filejob { -+ struct filejob *next; -+ int pid; -+ loff_t fdi; -+}; -+ /* Push a job for the descriptor record at pos, owned by the current task, onto the head of ctx->filejob_queue. Returns 0 or -ENOMEM. */ -+static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx) -+{ -+ struct filejob *j; -+ -+ j = kmalloc(sizeof(*j), GFP_KERNEL); -+ if (j == NULL) -+ return -ENOMEM; -+ j->pid = current->pid; -+ j->fdi = pos; -+ j->next = ctx->filejob_queue; -+ ctx->filejob_queue = j; -+ return 0; -+} -+ /* Pipe buffer release hook: keep the page in info->tmp_page when that slot is empty, otherwise free it, then drop the module reference that fixup_pipe_data() took for this buffer. */ -+static void _anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) -+{ -+ struct page *page = buf->page; -+ -+ if (info->tmp_page) { -+ __free_page(page); -+ } else { -+ info->tmp_page = page; -+ } -+ module_put(THIS_MODULE); -+} -+ /* Pipe buffer map hook: returns the kmap() address of the buffer page. file and info are unused. */ -+static void *_anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) -+{ -+ return 
kmap(buf->page); -+} -+ /* Pipe buffer unmap hook: kunmap() the buffer page. */ -+static void _anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) -+{ -+ kunmap(buf->page); -+} -+ /* Buffer operations installed on the pages that fixup_pipe_data() creates. */ -+static struct pipe_buf_operations _anon_pipe_buf_ops = { -+ .can_merge = 1, -+ .map = _anon_pipe_buf_map, -+ .unmap = _anon_pipe_buf_unmap, -+ .release = _anon_pipe_buf_release, -+}; -+ -+/* Sorta ugly... Multiple readers/writers of named pipe rewrite buffer -+ * many times. We need to mark it in CPT_OBJ_INODE table in some way. -+ */ /* Refill the pipe behind file from the image. The data is the CPT_OBJ_BITS record that follows the inode header at fi->cpt_inode; nothing is done when there is no inode, no trailing record, or the record is empty. Under the pipe mutex one page per PAGE_SIZE chunk is allocated, given _anon_pipe_buf_ops, appended to info->bufs and filled with pread(). Returns 0 or a negative errno (-EINVAL when file is not a FIFO or the pipe already holds buffers, -EBUSY, -ENOMEM). NOTE(review): the loop does not bound info->nrbufs, never kunmap()s the page it kmap()ed, and keeps the module reference when alloc_page() fails - verify. */ -+static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi, -+ struct cpt_context *ctx) -+{ -+ struct inode *ino = file->f_dentry->d_inode; -+ struct cpt_inode_image ii; -+ struct cpt_obj_bits b; -+ struct pipe_inode_info *info; -+ int err; -+ int count; -+ -+ if (!S_ISFIFO(ino->i_mode)) { -+ eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", fi->cpt_inode); -+ return -EINVAL; -+ } -+ if (fi->cpt_inode == CPT_NULL) -+ return 0; -+ -+ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); -+ if (err) -+ return err; -+ -+ if (ii.cpt_next <= ii.cpt_hdrlen) -+ return 0; -+ -+ err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx); -+ if (err) -+ return err; -+ -+ if (b.cpt_size == 0) -+ return 0; -+ -+ mutex_lock(PIPE_MUTEX(*ino)); -+ info = ino->i_pipe; -+ if (info->nrbufs) { -+ mutex_unlock(PIPE_MUTEX(*ino)); -+ eprintk("pipe buffer is restored already\n"); -+ return -EINVAL; -+ } -+ info->curbuf = 0; -+ count = 0; -+ while (count < b.cpt_size) { -+ struct pipe_buffer *buf = info->bufs + info->nrbufs; -+ void * addr; -+ int chars; -+ -+ chars = b.cpt_size - count; -+ if (chars > PAGE_SIZE) -+ chars = PAGE_SIZE; -+ if (!try_module_get(THIS_MODULE)) { /* reference is dropped again in _anon_pipe_buf_release() */ -+ err = -EBUSY; -+ break; -+ } -+ -+ buf->page = alloc_page(GFP_HIGHUSER); -+ if (buf->page == NULL) { -+ err = -ENOMEM; -+ break; -+ } -+ buf->ops = &_anon_pipe_buf_ops; -+ buf->offset = 0; -+ buf->len = chars; -+ info->nrbufs++; -+ addr = kmap(buf->page); -+ err = ctx->pread(addr, chars, ctx, -+ 
fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count); -+ if (err) -+ break; -+ count += chars; -+ } -+ mutex_unlock(PIPE_MUTEX(*ino)); -+ -+ return err; -+} -+ /* Build the open flags used to re-open the file described by fi. The access mode is derived from the FMODE_READ and FMODE_WRITE bits of cpt_mode; the saved cpt_flags are merged in without O_ACCMODE, O_CREAT, O_TRUNC, O_EXCL and FASYNC; O_NOFOLLOW, O_NONBLOCK and O_NOCTTY are always set. */ -+static int make_flags(struct cpt_file_image *fi) -+{ -+ int flags = O_NOFOLLOW; -+ switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) { -+ case FMODE_READ|FMODE_WRITE: -+ flags |= O_RDWR; break; -+ case FMODE_WRITE: -+ flags |= O_WRONLY; break; -+ case FMODE_READ: -+ flags |= O_RDONLY; break; -+ default: break; -+ } -+ flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC); -+ flags |= O_NONBLOCK|O_NOCTTY; -+ return flags; -+} -+ /* Open the pipe or FIFO described by fi. For a pipefs inode a new anonymous pipe is created with sc_pipe() and both ends are kept as file references, swapped so that wf is the end matching fi. Otherwise name is opened with filp_open(): a reader is returned as is, while for a writer the FIFO is first opened O_RDWR|O_NONBLOCK and the requested file is derived from it with dentry_open(). In the two-file cases the inode is added to the object table with the other end as o_parent, and that other end is added as a FILE object. Returns a file or ERR_PTR(). NOTE(review): the dentry_open() result in the FIFO branch is not checked with IS_ERR() before wf is dereferenced - verify. */ -+static struct file *open_pipe(char *name, -+ struct cpt_file_image *fi, -+ unsigned flags, -+ struct cpt_context *ctx) -+{ -+ int err; -+ cpt_object_t *obj; -+ struct cpt_inode_image ii; -+ struct file *rf, *wf; -+ -+ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); -+ if (err) -+ return ERR_PTR(err); -+ -+ if (ii.cpt_sb == FSMAGIC_PIPEFS) { -+ int pfd[2]; -+ -+ if ((err = sc_pipe(pfd)) < 0) -+ return ERR_PTR(err); -+ -+ rf = fcheck(pfd[0]); -+ wf = fcheck(pfd[1]); -+ get_file(rf); -+ get_file(wf); -+ sc_close(pfd[0]); -+ sc_close(pfd[1]); -+ -+ if (fi->cpt_mode&FMODE_READ) { -+ struct file *tf; -+ tf = wf; wf = rf; rf = tf; -+ } -+ } else { -+ if (fi->cpt_mode&FMODE_READ) { -+ rf = filp_open(name, flags, 0); -+ if (IS_ERR(rf)) { -+ dprintk_ctx("filp_open\n"); -+ return rf; -+ } -+ dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current), fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode); -+ return rf; -+ } -+ -+ dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), fi->cpt_inode); -+ -+ rf = filp_open(name, O_RDWR|O_NONBLOCK, 0); -+ if (IS_ERR(rf)) -+ return rf; -+ wf = dentry_open(dget(rf->f_dentry), -+ mntget(rf->f_vfsmnt), flags); -+ } -+ -+ /* Add pipe inode to obj table. 
*/ -+ obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx); -+ if (obj == NULL) { -+ fput(rf); fput(wf); -+ return ERR_PTR(-ENOMEM); -+ } -+ cpt_obj_setpos(obj, fi->cpt_inode, ctx); -+ obj->o_parent = rf; -+ -+ /* Add another side of pipe to obj table, it will not be used -+ * (o_pos = CPT_NULL), other processes opening the pipe will find -+ * inode and open it with dentry_open(). */ -+ obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx); -+ if (obj == NULL) { -+ fput(wf); -+ return ERR_PTR(-ENOMEM); -+ } -+ return wf; -+} -+ /* Try to open fi as a special file. Returns NULL when the caller should fall back to an ordinary open (directories, FIFOs and character devices with MEM_MAJOR), ERR_PTR(-EINVAL) for block devices, sockets and any other non-character inode, ERR_PTR(-ENOMEM) when the inode image cannot be loaded, and otherwise whatever rst_open_tty() returns. deleted is unused. */ -+static struct file *open_special(struct cpt_file_image *fi, -+ unsigned flags, -+ int deleted, -+ struct cpt_context *ctx) -+{ -+ struct cpt_inode_image *ii; -+ struct file *file; -+ -+ /* Directories and named pipes are not special actually */ -+ if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode)) -+ return NULL; -+ -+ /* No support for block devices at the moment. */ -+ if (S_ISBLK(fi->cpt_i_mode)) -+ return ERR_PTR(-EINVAL); -+ -+ if (S_ISSOCK(fi->cpt_i_mode)) { -+ eprintk_ctx("bug: socket is not open\n"); -+ return ERR_PTR(-EINVAL); -+ } -+ -+ /* Support only (some) character devices at the moment. */ -+ if (!S_ISCHR(fi->cpt_i_mode)) -+ return ERR_PTR(-EINVAL); -+ -+ ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx); -+ if (ii == NULL) -+ return ERR_PTR(-ENOMEM); -+ -+ /* Do not worry about this right now. /dev/null,zero,*random are here. -+ * To prohibit at least /dev/mem? 
-+ */ -+ if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) { -+ kfree(ii); -+ return NULL; -+ } -+ -+ file = rst_open_tty(fi, ii, flags, ctx); -+ kfree(ii); -+ return file; -+} -+ /* Re-take one POSIX lock on file as described by fli. The lock owner is the CPT_OBJ_FILES object with index fli->cpt_owner, the pid is translated with vpid_to_pid() and FL_SLEEP is cleared from the flags. Returns the result of posix_lock_file(), or -EINVAL when the owner or the pid cannot be resolved. */ -+static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx) -+{ -+ struct file_lock lock; -+ cpt_object_t *obj; -+ -+ memset(&lock, 0, sizeof(lock)); -+ lock.fl_type = fli->cpt_type; -+ lock.fl_flags = fli->cpt_flags & ~FL_SLEEP; -+ lock.fl_start = fli->cpt_start; -+ lock.fl_end = fli->cpt_end; -+ obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx); -+ if (!obj) { -+ eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner); -+ return -EINVAL; -+ } -+ lock.fl_owner = obj->o_obj; -+ lock.fl_pid = vpid_to_pid(fli->cpt_pid); -+ if (lock.fl_pid < 0) { -+ eprintk_ctx("unknown lock pid %d\n", lock.fl_pid); -+ return -EINVAL; -+ } -+ lock.fl_file = file; -+ -+ if (lock.fl_owner == NULL) -+ eprintk_ctx("no lock owner\n"); -+ return posix_lock_file(file, &lock); -+} -+ /* Re-take one BSD flock on file. The file is installed under a temporary descriptor, locked with sc_flock() using LOCK_NB plus LOCK_SH for F_RDLCK or LOCK_EX for F_WRLCK, and the descriptor is closed again. Returns the sc_flock() result, the get_unused_fd() error, or -EINVAL for any other lock type. */ -+static int restore_flock(struct file *file, struct cpt_flock_image *fli, -+ cpt_context_t *ctx) -+{ -+ int cmd, err, fd; -+ fd = get_unused_fd(); -+ if (fd < 0) { -+ eprintk_ctx("BSD flock cannot be restored\n"); -+ return fd; -+ } -+ get_file(file); -+ fd_install(fd, file); -+ if (fli->cpt_type == F_RDLCK) { -+ cmd = LOCK_SH; -+ } else if (fli->cpt_type == F_WRLCK) { -+ cmd = LOCK_EX; -+ } else { -+ eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type); -+ sc_close(fd); -+ return -EINVAL; -+ } -+ -+ err = sc_flock(fd, LOCK_NB | cmd); -+ sc_close(fd); -+ return err; -+} -+ -+ /* Walk the records that follow the file header at pos (from pos + cpt_hdrlen up to pos + cpt_next) and restore every CPT_OBJ_FLOCK record flagged FL_POSIX. Returns 0 or the first error. */ -+static int fixup_posix_locks(struct file *file, -+ struct cpt_file_image *fi, -+ loff_t pos, struct cpt_context *ctx) -+{ -+ int err; -+ loff_t end; -+ struct cpt_flock_image fli; -+ -+ end = pos + fi->cpt_next; -+ pos += fi->cpt_hdrlen; -+ while (pos < end) { -+ err = rst_get_object(-1, pos, &fli, ctx); -+ if (err) -+ return err; -+ if (fli.cpt_object == CPT_OBJ_FLOCK && -+ (fli.cpt_flags&FL_POSIX)) { -+ err 
= restore_posix_lock(file, &fli, ctx); -+ if (err) -+ return err; -+ dprintk_ctx("posix lock restored\n"); -+ } -+ pos += fli.cpt_next; -+ } -+ return 0; -+} -+ /* Restore POSIX locks for every FILE object that has an image position and records after its header. Returns 0, or the error from reading a file header. NOTE(review): the return value of fixup_posix_locks() is ignored here - confirm that this is intentional. */ -+int rst_posix_locks(struct cpt_context *ctx) -+{ -+ int err; -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file = obj->o_obj; -+ struct cpt_file_image fi; -+ -+ if (obj->o_pos == CPT_NULL) -+ continue; -+ -+ err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx); -+ if (err < 0) -+ return err; -+ if (fi.cpt_next > fi.cpt_hdrlen) -+ fixup_posix_locks(file, &fi, obj->o_pos, ctx); -+ } -+ return 0; -+} -+ /* Same record walk as fixup_posix_locks(), but restores the CPT_OBJ_FLOCK records flagged FL_FLOCK through restore_flock(). Returns 0 or the first error. */ -+static int fixup_flocks(struct file *file, -+ struct cpt_file_image *fi, -+ loff_t pos, struct cpt_context *ctx) -+{ -+ int err; -+ loff_t end; -+ struct cpt_flock_image fli; -+ -+ end = pos + fi->cpt_next; -+ pos += fi->cpt_hdrlen; -+ while (pos < end) { -+ err = rst_get_object(-1, pos, &fli, ctx); -+ if (err) -+ return err; -+ if (fli.cpt_object == CPT_OBJ_FLOCK && -+ (fli.cpt_flags&FL_FLOCK)) { -+ err = restore_flock(file, &fli, ctx); -+ if (err) -+ return err; -+ dprintk_ctx("bsd lock restored\n"); -+ } -+ pos += fli.cpt_next; -+ } -+ return 0; -+} -+ -+ /* Write the CPT_OBJ_PAGES blocks stored between image positions pos and end back into file. Each block is copied through the scratch buffer in pieces of at most PAGE_SIZE bytes, using the write method of the file under KERNEL_DS at the offsets given by cpt_start. If file lacks FMODE_WRITE or is O_DIRECT it is replaced by an O_WRONLY dentry_open() of the same dentry. An extra reference on the file in use is held for the duration and dropped on exit. Returns 0, a negative errno, -EINVAL without a write method, or -EIO on a short write. */ -+static int fixup_reg_data(struct file *file, loff_t pos, loff_t end, -+ struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_page_block pgb; -+ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); -+ -+ do_write = file->f_op->write; -+ if (do_write == NULL) { -+ eprintk_ctx("no write method. 
Cannot restore contents of the file.\n"); -+ return -EINVAL; -+ } -+ -+ atomic_inc(&file->f_count); -+ -+ while (pos < end) { -+ loff_t opos; -+ loff_t ipos; -+ int count; -+ -+ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); -+ if (err) -+ goto out; -+ dprintk_ctx("restoring file data block: %08x-%08x\n", -+ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); -+ ipos = pos + pgb.cpt_hdrlen; -+ opos = pgb.cpt_start; -+ count = pgb.cpt_end-pgb.cpt_start; -+ while (count > 0) { -+ mm_segment_t oldfs; -+ int copy = count; -+ -+ if (copy > PAGE_SIZE) -+ copy = PAGE_SIZE; -+ (void)cpt_get_buf(ctx); -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); -+ set_fs(oldfs); -+ if (err) { -+ __cpt_release_buf(ctx); -+ goto out; -+ } -+ if (!(file->f_mode & FMODE_WRITE) || -+ (file->f_flags&O_DIRECT)) { -+ fput(file); -+ file = dentry_open(dget(file->f_dentry), -+ mntget(file->f_vfsmnt), O_WRONLY); -+ if (IS_ERR(file)) { -+ __cpt_release_buf(ctx); -+ return PTR_ERR(file); -+ } -+ } -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ ipos += copy; -+ err = do_write(file, ctx->tmpbuf, copy, &opos); -+ set_fs(oldfs); -+ __cpt_release_buf(ctx); -+ if (err != copy) { -+ if (err >= 0) -+ err = -EIO; -+ goto out; -+ } -+ count -= copy; -+ } -+ pos += pgb.cpt_next; -+ } -+ err = 0; -+ -+out: -+ fput(file); -+ return err; -+} -+ -+ /* Restore data, size and timestamps of a regular file from its inode image; any other inode type is left alone. When *file_p is NULL a shmem file named dev/zero is created with the saved size and stored through file_p. Data blocks are written with fixup_reg_data(), then size and ctime, followed by atime and mtime, are applied with notify_change() under i_mutex. Returns 0 or a negative errno. */ -+static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi, -+ struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_inode_image ii; -+ struct file *file = *file_p; -+ struct iattr newattrs; -+ -+ if (!S_ISREG(fi->cpt_i_mode)) -+ return 0; -+ -+ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); -+ if (err) -+ return err; -+ -+ if (file == NULL) { -+ file = shmem_file_setup("dev/zero", ii.cpt_size, 0); -+ if (IS_ERR(file)) -+ return PTR_ERR(file); -+ *file_p = file; -+ } -+ -+ if (ii.cpt_next > ii.cpt_hdrlen) { -+ err = fixup_reg_data(file, fi->cpt_inode+ii.cpt_hdrlen, -+ fi->cpt_inode+ii.cpt_next, 
ctx); -+ if (err) -+ return err; -+ } -+ -+ mutex_lock(&file->f_dentry->d_inode->i_mutex); -+ /* stage 1 - update size like do_truncate does */ -+ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; -+ newattrs.ia_size = ii.cpt_size; -+ cpt_timespec_import(&newattrs.ia_ctime, ii.cpt_ctime); -+ err = notify_change(file->f_dentry, &newattrs); -+ if (err) -+ goto out; -+ -+ /* stage 2 - update times */ -+ newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME | -+ ATTR_ATIME_SET | ATTR_MTIME_SET; -+ cpt_timespec_import(&newattrs.ia_atime, ii.cpt_atime); -+ cpt_timespec_import(&newattrs.ia_mtime, ii.cpt_mtime); -+ err = notify_change(file->f_dentry, &newattrs); -+ -+out: -+ mutex_unlock(&file->f_dentry->d_inode->i_mutex); -+ return err; -+} -+ /* Bring the state of a freshly opened file in line with the image. Seeks to cpt_pos (f_pos is forced when llseek is missing or fails), copies uid, gid and the fown fields, adjusts FMODE_PREAD and FMODE_LSEEK for files obtained through dentry_open(), and reconciles O_NOFOLLOW, O_NONBLOCK and FASYNC with the saved flags. pos is the image position of the file and is only used in messages. Returns 0, or -EINVAL when the fown pid cannot be translated, FASYNC cannot be set up, or the flags still differ afterwards. */ -+static int fixup_file_flags(struct file *file, struct cpt_file_image *fi, -+ int was_dentry_open, loff_t pos, -+ cpt_context_t *ctx) -+{ -+ if (fi->cpt_pos != file->f_pos) { -+ int err = -ESPIPE; -+ if (file->f_op->llseek) -+ err = file->f_op->llseek(file, fi->cpt_pos, 0); -+ if (err < 0) { -+ dprintk_ctx("file %Ld lseek %Ld - %Ld\n", pos, file->f_pos, fi->cpt_pos); -+ file->f_pos = fi->cpt_pos; -+ } -+ } -+ file->f_uid = fi->cpt_uid; -+ file->f_gid = fi->cpt_gid; -+ file->f_owner.pid = 0; -+ if (fi->cpt_fown_pid) { -+ file->f_owner.pid = comb_vpid_to_pid(fi->cpt_fown_pid); -+ if (file->f_owner.pid == 0) { /* report the pid from the image; f_owner.pid is always 0 in this branch */ -+ wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n", (int)fi->cpt_fown_pid); -+ return -EINVAL; -+ } -+ } -+ file->f_owner.uid = fi->cpt_fown_uid; -+ file->f_owner.euid = fi->cpt_fown_euid; -+ file->f_owner.signum = fi->cpt_fown_signo; -+ -+ if (file->f_mode != fi->cpt_mode) { -+ if (was_dentry_open && -+ ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) { -+ file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK); -+ file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK); -+ } -+ if (file->f_mode != fi->cpt_mode) -+ wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode); -+ } -+ if (file->f_flags != 
fi->cpt_flags) { -+ if (!(fi->cpt_flags&O_NOFOLLOW)) -+ file->f_flags &= ~O_NOFOLLOW; -+ if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) { -+ file->f_flags &= ~O_NONBLOCK; -+ file->f_flags |= fi->cpt_flags&O_NONBLOCK; -+ } -+ if (fi->cpt_flags&FASYNC) { -+ if (fi->cpt_fown_fd == -1) { -+ wprintk_ctx("No fd for FASYNC\n"); -+ return -EINVAL; -+ } else if (file->f_op && file->f_op->fasync) { -+ if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) { -+ wprintk_ctx("FASYNC problem\n"); -+ return -EINVAL; -+ } else { -+ file->f_flags |= FASYNC; -+ } -+ } -+ } -+ if (file->f_flags != fi->cpt_flags) { -+ eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags); -+ return -EINVAL; -+ } -+ } -+ return 0; -+} -+ /* Recreate a file that was unlinked at checkpoint time. The (deleted) marker is stripped from name, the object is created under that name (sc_mknod for FIFOs and character devices, sc_mkdir for directories, an O_CREAT|O_EXCL open otherwise), opened, and immediately unlinked or removed again. On -EEXIST a .%08x suffix is written at the stripped position and the attempt repeated, giving up with -EEXIST after more than 1000 attempts or when there is no suffix position; on any other creation error the procedure is retried once with a name built from /tmp/rst and the current pid. name is modified in place. Returns a file or ERR_PTR(). */ -+static struct file * -+open_deleted(char *name, unsigned flags, struct cpt_file_image *fi, -+ cpt_context_t *ctx) -+{ -+ struct file * file; -+ char *suffix = NULL; -+ int attempt = 0; -+ int tmp_pass = 0; -+ mode_t mode = fi->cpt_i_mode; -+ -+ /* Strip (deleted) part... 
*/ -+ if (strlen(name) > strlen(" (deleted)")) { -+ if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) { -+ suffix = &name[strlen(name) - strlen(" (deleted)")]; -+ *suffix = 0; -+ } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) { -+ memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1); -+ suffix = name + strlen(name); -+ } -+ } -+ -+try_again: -+ for (;;) { -+ if (attempt) { -+ if (attempt > 1000) { -+ eprintk_ctx("open_deleted: failed after %d attempts\n", attempt); -+ return ERR_PTR(-EEXIST); -+ } -+ if (suffix == NULL) { -+ eprintk_ctx("open_deleted: no suffix\n"); -+ return ERR_PTR(-EEXIST); -+ } -+ sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt)); -+ } -+ attempt++; -+ -+ if (S_ISFIFO(mode)) { -+ int err; -+ err = sc_mknod(name, S_IFIFO|(mode&017777), 0); -+ if (err == -EEXIST) -+ continue; -+ if (err < 0 && !tmp_pass) -+ goto change_dir; -+ if (err < 0) -+ return ERR_PTR(err); -+ file = open_pipe(name, fi, flags, ctx); -+ sc_unlink(name); -+ } else if (S_ISCHR(mode)) { -+ int err; -+ struct cpt_inode_image *ii; -+ -+ ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx); -+ if (ii == NULL) -+ return ERR_PTR(-ENOMEM); -+ err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev)); -+ kfree(ii); -+ if (err == -EEXIST) -+ continue; -+ if (err < 0 && !tmp_pass) -+ goto change_dir; -+ if (err < 0) -+ return ERR_PTR(err); -+ file = filp_open(name, flags, mode&017777); -+ sc_unlink(name); -+ } else if (S_ISDIR(mode)) { -+ int err; -+ err = sc_mkdir(name, mode&017777); -+ if (err == -EEXIST) -+ continue; -+ if (err < 0 && !tmp_pass) -+ goto change_dir; -+ if (err < 0) -+ return ERR_PTR(err); -+ file = filp_open(name, flags, mode&017777); -+ sc_rmdir(name); -+ } else { -+ file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777); -+ if (IS_ERR(file)) { -+ if (PTR_ERR(file) == -EEXIST) -+ continue; -+ if (!tmp_pass) -+ goto change_dir; -+ } else { -+ 
sc_unlink(name); -+ } -+ } -+ break; -+ } -+ -+ if (IS_ERR(file)) { -+ eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file)); -+ return file; -+ } else { -+ dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode); -+ } -+ return file; -+ -+change_dir: -+ sprintf(name, "/tmp/rst%u", current->pid); -+ suffix = name + strlen(name); -+ attempt = 1; -+ tmp_pass = 1; -+ goto try_again; -+} -+ /* Restore the file stored at image position pos and return a reference to it. A file already present in the object table is reused, with its flags fixed up first when o_index is non-negative. Otherwise the file is obtained by the first applicable route: dentry_open() on an inode that is already open, re-creation of a deleted file, the epoll, pipe or special-file helpers, or finally filp_open() by name. The result is then fixed up (flags, pipe data, BSD locks) and recorded in the FILE and INODE object tables. Returns the file, NULL when opening failed for a CPT_DENTRY_PROC file and the restore is delayed, or ERR_PTR(). fd is not used in this function. NOTE(review): open_deleted() can be reached with name == NULL for deleted FIFO, directory or character inodes - verify. */ -+struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx) -+{ -+ int err; -+ int was_dentry_open = 0; -+ cpt_object_t *obj; -+ cpt_object_t *iobj; -+ struct cpt_file_image fi; -+ __u8 *name = NULL; -+ struct file *file; -+ int flags; -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx); -+ if (obj) { -+ file = obj->o_obj; -+ if (obj->o_index >= 0) { -+ dprintk_ctx("file is attached to a socket\n"); -+ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); -+ if (err < 0) -+ goto err_out; -+ fixup_file_flags(file, &fi, 0, pos, ctx); -+ } -+ get_file(file); -+ return file; -+ } -+ -+ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); -+ if (err < 0) -+ goto err_out; -+ -+ flags = make_flags(&fi); -+ -+ /* Easy way, inode has been already open. */ -+ if (fi.cpt_inode != CPT_NULL && -+ !(fi.cpt_lflags & CPT_DENTRY_CLONING) && -+ (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL && -+ iobj->o_parent) { -+ struct file *filp = iobj->o_parent; -+ file = dentry_open(dget(filp->f_dentry), -+ mntget(filp->f_vfsmnt), flags); -+ dprintk_ctx("rst_file: file obtained by dentry_open\n"); -+ was_dentry_open = 1; -+ goto map_file; -+ } -+ -+ if (fi.cpt_next > fi.cpt_hdrlen) -+ name = rst_get_name(pos + sizeof(fi), ctx); -+ -+ if (fi.cpt_lflags == CPT_DENTRY_DELETED) { -+ if (fi.cpt_inode == CPT_NULL) { -+ eprintk_ctx("deleted file and no inode.\n"); -+ err = -EINVAL; -+ goto err_out; -+ } -+ -+ /* One very special case... 
*/ -+ if (S_ISREG(fi.cpt_i_mode) && -+ (!name || !name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) { -+ /* MAP_ANON|MAP_SHARED mapping. -+ * kernel makes this damn ugly way, when file which -+ * is passed to mmap by user does not match -+ * file finally attached to VMA. Ok, rst_mm -+ * has to take care of this. Otherwise, it will fail. -+ */ -+ file = NULL; -+ } else if (S_ISREG(fi.cpt_i_mode) || -+ S_ISCHR(fi.cpt_i_mode) || -+ S_ISFIFO(fi.cpt_i_mode) || -+ S_ISDIR(fi.cpt_i_mode)) { -+ if (S_ISCHR(fi.cpt_i_mode)) { -+ file = open_special(&fi, flags, 1, ctx); -+ if (file != NULL) -+ goto map_file; -+ } -+ file = open_deleted(name, flags, &fi, ctx); -+ if (IS_ERR(file)) -+ goto out; -+ } else { -+ eprintk_ctx("not a regular deleted file.\n"); -+ err = -EINVAL; -+ goto err_out; -+ } -+ -+ err = fixup_file_content(&file, &fi, ctx); -+ if (err) -+ goto err_put; -+ goto map_file; -+ } else { -+ if (!name || !name[0]) { -+ eprintk_ctx("no name for file?\n"); -+ err = -EINVAL; -+ goto err_out; -+ } -+ if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) && -+ (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL) -+ goto map_file; -+ if (S_ISFIFO(fi.cpt_i_mode) && -+ (file = open_pipe(name, &fi, flags, ctx)) != NULL) -+ goto map_file; -+ if (!S_ISREG(fi.cpt_i_mode) && -+ (file = open_special(&fi, flags, 0, ctx)) != NULL) -+ goto map_file; -+ } -+ -+ file = filp_open(name, flags, 0); -+ -+map_file: -+ if (!IS_ERR(file)) { -+ fixup_file_flags(file, &fi, was_dentry_open, pos, ctx); -+ -+ if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) { -+ err = fixup_pipe_data(file, &fi, ctx); -+ if (err) -+ goto err_put; -+ } -+ -+ obj = cpt_object_get(CPT_OBJ_FILE, file, ctx); -+ if (!obj) { -+ obj = cpt_object_add(CPT_OBJ_FILE, file, ctx); -+ if (obj) -+ get_file(file); -+ } -+ if (obj) -+ cpt_obj_setpos(obj, pos, ctx); -+ -+ obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); -+ if (obj) { -+ cpt_obj_setpos(obj, fi.cpt_inode, ctx); -+ if (!obj->o_parent || fi.cpt_lflags != 
CPT_DENTRY_DELETED) -+ obj->o_parent = file; -+ } -+ -+ if (fi.cpt_next > fi.cpt_hdrlen) { -+ err = fixup_flocks(file, &fi, pos, ctx); -+ if (err) -+ goto err_put; -+ } -+ } else { -+ if (fi.cpt_lflags & CPT_DENTRY_PROC) { -+ dprintk_ctx("rst_file /proc delayed\n"); -+ file = NULL; -+ } -+ } -+ -+out: -+ if (name) -+ rst_put_name(name, ctx); -+ return file; -+ -+err_put: -+ if (file) -+ fput(file); -+err_out: -+ if (name) -+ rst_put_name(name, ctx); -+ return ERR_PTR(err); -+} -+ -+ /* Compute clone flags for the task described by ti: CLONE_FILES is set when the task has no descriptor table in the image (CPT_NULL) or that table is already in the object table, and CLONE_FS likewise for the fs structure. */ -+__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ __u32 flag = 0; -+ -+ if (ti->cpt_files == CPT_NULL || -+ lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx)) -+ flag |= CLONE_FILES; -+ if (ti->cpt_fs == CPT_NULL || -+ lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx)) -+ flag |= CLONE_FS; -+ return flag; -+} -+ /* Close every descriptor marked open in files and clear the open and close-on-exec bitmaps, one bitmap word (__NFDBITS descriptors) at a time. */ -+static void local_close_files(struct files_struct * files) -+{ -+ int i, j; -+ -+ j = 0; -+ for (;;) { -+ unsigned long set; -+ i = j * __NFDBITS; -+ if (i >= files->fdt->max_fdset || i >= files->fdt->max_fds) -+ break; -+ set = files->fdt->open_fds->fds_bits[j]; -+ while (set) { -+ if (set & 1) { -+ struct file * file = xchg(&files->fdt->fd[i], NULL); -+ if (file) -+ filp_close(file, files); -+ } -+ i++; -+ set >>= 1; -+ } -+ files->fdt->open_fds->fds_bits[j] = 0; -+ files->fdt->close_on_exec->fds_bits[j] = 0; -+ j++; -+ } -+} -+ -+extern int expand_fdtable(struct files_struct *files, int nr); -+ -+ /* Finish the descriptor table of the current task. With no table in the image the task is left without one. A table that is already in the object table is shared with the current task. Otherwise the current table is emptied, grown to cpt_max_fds if needed and refilled from the CPT_OBJ_FILEDESC records; a file job is queued for every descriptor whose file rst_file() returned as NULL. The table is then registered in the object table. Returns 0 or a negative errno. */ -+int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ struct cpt_files_struct_image fi; -+ struct files_struct *f = current->files; -+ cpt_object_t *obj; -+ loff_t pos, endpos; -+ int err; -+ -+ if (ti->cpt_files == CPT_NULL) { -+ current->files = NULL; -+ if (f) -+ put_files_struct(f); -+ return 0; -+ } -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx); -+ if (obj) { -+ if (obj->o_obj != f) { -+ put_files_struct(f); -+ f = obj->o_obj; -+ atomic_inc(&f->count); -+ current->files = f; -+ } -+ 
return 0; -+ } -+ -+ err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); -+ if (err) -+ return err; -+ -+ local_close_files(f); -+ -+ if (fi.cpt_max_fds > f->fdt->max_fds) { -+ spin_lock(&f->file_lock); -+ err = expand_fdtable(f, fi.cpt_max_fds-1); -+ spin_unlock(&f->file_lock); -+ if (err) -+ return err; -+ } -+ -+ pos = ti->cpt_files + fi.cpt_hdrlen; -+ endpos = ti->cpt_files + fi.cpt_next; -+ while (pos < endpos) { -+ struct cpt_fd_image fdi; -+ struct file *filp; -+ -+ err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx); -+ if (err) -+ return err; -+ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); -+ if (IS_ERR(filp)) { -+ eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), fdi.cpt_file); -+ return PTR_ERR(filp); -+ } -+ if (filp == NULL) { /* file cannot be opened yet: defer this descriptor to rst_do_filejobs() */ -+ int err = rst_filejob_queue(pos, ctx); -+ if (err) -+ return err; -+ } else { -+ if (fdi.cpt_fd >= f->fdt->max_fds) BUG(); -+ f->fdt->fd[fdi.cpt_fd] = filp; -+ FD_SET(fdi.cpt_fd, f->fdt->open_fds); -+ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) -+ FD_SET(fdi.cpt_fd, f->fdt->close_on_exec); -+ } -+ pos += fdi.cpt_next; -+ } -+ f->fdt->next_fd = fi.cpt_next_fd; -+ -+ obj = cpt_object_add(CPT_OBJ_FILES, f, ctx); -+ if (obj) { -+ cpt_obj_setpos(obj, ti->cpt_files, ctx); -+ cpt_obj_setindex(obj, fi.cpt_index, ctx); -+ } -+ return 0; -+} -+ /* Run the deferred jobs queued by rst_files_complete(). For each job the owning task is looked up by pid, its descriptor record is re-read, the file is restored with rst_file() and installed in the descriptor table of that task, including the close-on-exec bit. The slot must still be free, otherwise -EBUSY is returned. Processing stops at the first error and leaves the remaining jobs on the queue. Returns 0 or a negative errno (-EINVAL when the task no longer exists). */ -+int rst_do_filejobs(cpt_context_t *ctx) -+{ -+ struct filejob *j; -+ -+ while ((j = ctx->filejob_queue) != NULL) { -+ int err; -+ task_t *tsk; -+ struct cpt_fd_image fdi; -+ struct file *filp; -+ -+ read_lock(&tasklist_lock); -+ tsk = find_task_by_pid_ve(j->pid); -+ if (tsk) -+ get_task_struct(tsk); -+ read_unlock(&tasklist_lock); -+ if (!tsk) -+ return -EINVAL; -+ -+ err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx); -+ if (err) { -+ put_task_struct(tsk); -+ return err; -+ } -+ -+ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); -+ if (tsk->files->fdt->fd[fdi.cpt_fd] || -+ FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) { -+ eprintk_ctx("doing filejob %Ld: fd is 
busy\n", j->fdi); -+ put_task_struct(tsk); -+ return -EBUSY; -+ } -+ -+ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); -+ if (IS_ERR(filp)) { -+ eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), fdi.cpt_file); -+ put_task_struct(tsk); -+ return PTR_ERR(filp); -+ } -+ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); -+ tsk->files->fdt->fd[fdi.cpt_fd] = filp; -+ FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds); -+ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) -+ FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec); -+ -+ dprintk_ctx("filejob %Ld done\n", j->fdi); -+ -+ put_task_struct(tsk); -+ ctx->filejob_queue = j->next; -+ kfree(j); -+ } -+ return 0; -+} -+ -+void rst_flush_filejobs(cpt_context_t *ctx) -+{ -+ struct filejob *j; -+ -+ while ((j = ctx->filejob_queue) != NULL) { -+ ctx->filejob_queue = j->next; -+ kfree(j); -+ } -+} -+ -+int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ struct fs_struct *f = current->fs; -+ cpt_object_t *obj; -+ -+ if (ti->cpt_fs == CPT_NULL) { -+ exit_fs(current); -+ return 0; -+ } -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx); -+ if (obj) { -+ if (obj->o_obj != f) { -+ exit_fs(current); -+ f = obj->o_obj; -+ atomic_inc(&f->count); -+ current->fs = f; -+ } -+ return 0; -+ } -+ -+ /* Do _not_ restore root. Image contains absolute pathnames. -+ * So, we fix it in context of rst process. 
-+ */ -+ -+ obj = cpt_object_add(CPT_OBJ_FS, f, ctx); -+ if (obj) -+ cpt_obj_setpos(obj, ti->cpt_fs, ctx); -+ -+ return 0; -+} -+ -+static int get_dir(struct dentry **dp, struct vfsmount **mp, -+ loff_t *pos, struct cpt_context *ctx) -+{ -+ struct cpt_file_image fi; -+ struct file * file; -+ int err; -+ -+ err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx); -+ if (err) -+ return err; -+ -+ file = rst_file(*pos, -1, ctx); -+ if (IS_ERR(file)) -+ return PTR_ERR(file); -+ -+ *dp = dget(file->f_dentry); -+ *mp = mntget(file->f_vfsmnt); -+ *pos += fi.cpt_next; -+ fput(file); -+ return 0; -+} -+ -+static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, -+ struct dentry *dentry) -+{ -+ struct dentry *old_root; -+ struct vfsmount *old_rootmnt; -+ write_lock(&fs->lock); -+ old_root = fs->root; -+ old_rootmnt = fs->rootmnt; -+ fs->rootmnt = mnt; -+ fs->root = dentry; -+ write_unlock(&fs->lock); -+ if (old_root) { -+ dput(old_root); -+ mntput(old_rootmnt); -+ } -+} -+ -+static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, -+ struct dentry *dentry) -+{ -+ struct dentry *old_pwd; -+ struct vfsmount *old_pwdmnt; -+ -+ write_lock(&fs->lock); -+ old_pwd = fs->pwd; -+ old_pwdmnt = fs->pwdmnt; -+ fs->pwdmnt = mnt; -+ fs->pwd = dentry; -+ write_unlock(&fs->lock); -+ -+ if (old_pwd) { -+ dput(old_pwd); -+ mntput(old_pwdmnt); -+ } -+} -+ -+ -+int rst_restore_fs(struct cpt_context *ctx) -+{ -+ loff_t pos; -+ cpt_object_t *obj; -+ int err = 0; -+ -+ for_each_object(obj, CPT_OBJ_FS) { -+ struct cpt_fs_struct_image fi; -+ struct fs_struct *fs = obj->o_obj; -+ int i; -+ struct dentry *d[3]; -+ struct vfsmount *m[3]; -+ -+ err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx); -+ if (err) -+ return err; -+ -+ fs->umask = fi.cpt_umask; -+ -+ pos = obj->o_pos + fi.cpt_hdrlen; -+ d[0] = d[1] = d[2] = NULL; -+ m[0] = m[1] = m[2] = NULL; -+ i = 0; -+ while (pos < obj->o_pos + fi.cpt_next && i<3) { -+ err = get_dir(d+i, m+i, &pos, ctx); -+ if (err) { -+ 
eprintk_ctx("cannot get_dir: %d", err); -+ break; -+ } -+ i++; -+ } -+ if (d[0]) -+ __set_fs_root(fs, m[0], d[0]); -+ if (d[1]) -+ __set_fs_pwd(fs, m[1], d[1]); -+ if (d[2]) { -+ struct dentry *olddentry; -+ struct vfsmount *oldmnt; -+ write_lock(&fs->lock); -+ oldmnt = fs->altrootmnt; -+ olddentry = fs->altroot; -+ fs->altrootmnt = m[2]; -+ fs->altroot = d[2]; -+ write_unlock(&fs->lock); -+ -+ if (olddentry) { -+ dput(olddentry); -+ mntput(oldmnt); -+ } -+ } -+ } -+ return err; -+} -+ -+int do_one_mount(char *mntpnt, char *mnttype, char *mntbind, unsigned long flags, struct cpt_context *ctx) -+{ -+ int err; -+ -+ if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0)) -+ mntbind = NULL; -+ -+ if (mntbind) -+ flags |= MS_BIND; -+ -+ err = sc_mount(mntbind, mntpnt, mnttype, flags); -+ if (err < 0) { -+ eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags); -+ return err; -+ } -+ return 0; -+} -+ -+static int undumptmpfs(void *arg) -+{ -+ int i; -+ int *pfd = arg; -+ char *argv[] = { "tar", "x", "-C", "/", "-S", NULL }; -+ -+ if (pfd[0] != 0) -+ sc_dup2(pfd[0], 0); -+ -+ for (i=1; i<current->files->fdt->max_fds; i++) -+ sc_close(i); -+ -+ module_put(THIS_MODULE); -+ -+ set_fs(KERNEL_DS); -+ i = sc_execve("/bin/tar", argv, NULL); -+ eprintk("failed to exec /bin/tar: %d\n", i); -+ return -1; -+} -+ -+static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx) -+{ -+ int err; -+ int pfd[2]; -+ struct file *f; -+ struct cpt_object_hdr v; -+ int n; -+ loff_t end; -+ int pid; -+ -+ err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx); -+ if (err < 0) -+ return err; -+ -+ err = sc_pipe(pfd); -+ if (err < 0) -+ return err; -+ pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0); -+ if (err < 0) -+ goto out; -+ f = fget(pfd[1]); -+ sc_close(pfd[1]); -+ sc_close(pfd[0]); -+ -+ ctx->file->f_pos = *pos + v.cpt_hdrlen; -+ end = *pos + v.cpt_next; -+ *pos += v.cpt_next; -+ do { -+ char buf[16]; -+ mm_segment_t oldfs; -+ -+ n = 
end - ctx->file->f_pos; -+ if (n > sizeof(buf)) -+ n = sizeof(buf); -+ -+ if (ctx->read(buf, n, ctx)) -+ break; -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ f->f_op->write(f, buf, n, &f->f_pos); -+ set_fs(oldfs); -+ } while (ctx->file->f_pos < end); -+ -+ fput(f); -+ -+ clear_tsk_thread_flag(current,TIF_SIGPENDING); -+ -+ if ((err = sc_waitx(pid, 0)) < 0) -+ eprintk_ctx("wait4: %d\n", err); -+ -+ return 0; -+ -+out: -+ if (pfd[1] >= 0) -+ sc_close(pfd[1]); -+ if (pfd[0] >= 0) -+ sc_close(pfd[0]); -+ return err; -+} -+ -+int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx) -+{ -+ int err; -+ loff_t endpos; -+ -+ endpos = pos + mi->cpt_next; -+ pos += mi->cpt_hdrlen; -+ -+ while (pos < endpos) { -+ char *mntdev; -+ char *mntpnt; -+ char *mnttype; -+ char *mntbind; -+ -+ mntdev = __rst_get_name(&pos, ctx); -+ mntpnt = __rst_get_name(&pos, ctx); -+ mnttype = __rst_get_name(&pos, ctx); -+ mntbind = __rst_get_name(&pos, ctx); -+ err = -EINVAL; -+ if (mnttype && mntpnt) { -+ err = 0; -+ if (strcmp(mntpnt, "/")) -+ err = do_one_mount(mntpnt, mnttype, mntbind, mi->cpt_flags, ctx); -+ if (strcmp(mnttype, "tmpfs") == 0) { -+ rst_restore_tmpfs(&pos, ctx); -+ } -+ } -+ if (mntdev) -+ rst_put_name(mntdev, ctx); -+ if (mntpnt) -+ rst_put_name(mntpnt, ctx); -+ if (mnttype) -+ rst_put_name(mnttype, ctx); -+ if (mntbind) -+ rst_put_name(mntbind, ctx); -+ if (err) -+ return err; -+ } -+ return 0; -+} -+ -+int restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_vfsmount_image mi; -+ -+ while (pos < endpos) { -+ err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx); -+ if (err) -+ return err; -+ err = restore_one_vfsmount(&mi, pos, ctx); -+ if (err) -+ return err; -+ pos += mi.cpt_next; -+ } -+ return 0; -+} -+ -+int rst_root_namespace(struct cpt_context *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_NAMESPACE]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ struct cpt_object_hdr 
sbuf; -+ int done = 0; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx); -+ if (err) -+ return err; -+ if (done) { -+ eprintk_ctx("multiple namespaces are not supported\n"); -+ break; -+ } -+ done++; -+ err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx); -+ if (err) -+ return err; -+ sec += sbuf.cpt_next; -+ } -+ -+ return 0; -+} -+ -+int rst_stray_files(struct cpt_context *ctx) -+{ -+ int err = 0; -+ loff_t sec = ctx->sections[CPT_SECT_FILES]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ struct cpt_object_hdr sbuf; -+ cpt_object_t *obj; -+ -+ err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx); -+ if (err) -+ break; -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx); -+ if (!obj) { -+ struct file *file; -+ -+ dprintk_ctx("stray file %Ld\n", sec); -+ -+ file = rst_sysv_shm(sec, ctx); -+ -+ if (IS_ERR(file)) { -+ eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file)); -+ return PTR_ERR(file); -+ } else { -+ fput(file); -+ } -+ } -+ sec += sbuf.cpt_next; -+ } -+ -+ return err; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_i386.S linux-2.6.16-026test009/kernel/cpt/rst_i386.S ---- linux-2.6.16.orig/kernel/cpt/rst_i386.S 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_i386.S 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,40 @@ -+#define ASSEMBLY 1 -+ -+#include <linux/config.h> -+#include <linux/linkage.h> 
-+#include <asm/thread_info.h> -+#include <asm/errno.h> -+#include <asm/segment.h> -+#include <asm/page.h> -+#include <asm/smp.h> -+#include <asm/page.h> -+ -+ .section .text -+ .align 4 -+ .global ret_last_siginfo -+ret_last_siginfo: -+ call rlsi -+ movl %eax,%esp -+ ret -+ -+ .align 8 -+ .global ret_child_tid -+ret_child_tid: -+ push %esp -+ call rct -+ movl %eax,%esp -+ ret -+ -+ .align 4 -+ .global ret_from_rst -+ret_from_rst: -+ pushl %eax -+ jmp ret_from_fork+6 -+ -+ .align 4 -+ .global pre_ret_from_fork -+pre_ret_from_fork: -+ pushl %eax -+ call schedule_tail -+ popl %eax -+ ret -diff -upr linux-2.6.16.orig/kernel/cpt/rst_mm.c linux-2.6.16-026test009/kernel/cpt/rst_mm.c ---- linux-2.6.16.orig/kernel/cpt/rst_mm.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_mm.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,985 @@ -+/* -+ * -+ * kernel/cpt/rst_mm.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/virtinfo.h> -+#include <linux/hugetlb.h> -+#include <linux/errno.h> -+#include <linux/errno.h> -+#include <linux/pagemap.h> -+#include <linux/mman.h> -+#include <linux/vmalloc.h> -+#include <linux/rmap.h> -+#include <linux/hash.h> -+#include <asm/pgalloc.h> -+#include <asm/tlb.h> -+#include <asm/tlbflush.h> -+#include <asm/pgtable.h> -+#include <asm/mmu.h> -+#include <asm/ldt.h> -+#include <asm/desc.h> -+#include <asm/mmu_context.h> -+#include <linux/swapops.h> -+#include <linux/cpt_image.h> -+ -+#ifdef CONFIG_VE -+#include <ub/beancounter.h> -+#include <ub/ub_vmpages.h> -+#endif -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_files.h" -+#include "cpt_ubc.h" -+#include "cpt_mm.h" -+#include "cpt_kernel.h" -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+#include "cpt_pagein.h" -+#endif -+ -+#include "cpt_syscalls.h" -+ -+#define __PAGE_NX (1ULL<<63) -+ -+static unsigned long make_prot(struct cpt_vma_image *vmai) -+{ -+ unsigned long prot = 0; -+ -+ if (vmai->cpt_flags&VM_READ) -+ prot |= PROT_READ; -+ if (vmai->cpt_flags&VM_WRITE) -+ prot |= PROT_WRITE; -+ if (vmai->cpt_flags&VM_EXEC) -+ prot |= PROT_EXEC; -+ if (vmai->cpt_flags&VM_GROWSDOWN) -+ prot |= PROT_GROWSDOWN; -+ if (vmai->cpt_flags&VM_GROWSUP) -+ prot |= PROT_GROWSUP; -+ return prot; -+} -+ -+static unsigned long make_flags(struct cpt_vma_image *vmai) -+{ -+ unsigned long flags = MAP_FIXED; -+ -+ if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE)) -+ flags |= MAP_SHARED; -+ else -+ flags |= MAP_PRIVATE; -+ -+ if (vmai->cpt_file == CPT_NULL) -+ flags |= MAP_ANONYMOUS; -+ if (vmai->cpt_flags&VM_GROWSDOWN) -+ flags |= MAP_GROWSDOWN; -+ if (vmai->cpt_flags&VM_DENYWRITE) -+ flags |= MAP_DENYWRITE; -+ if (vmai->cpt_flags&VM_EXECUTABLE) -+ 
flags |= MAP_EXECUTABLE; -+ if (!(vmai->cpt_flags&VM_ACCOUNT)) -+ flags |= MAP_NORESERVE; -+ return flags; -+} -+ -+ -+#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) -+static int __alloc_ldt(mm_context_t *pc, int mincount) -+{ -+ int oldsize, newsize, i; -+ -+ if (mincount <= pc->size) -+ return 0; -+ /* -+ * LDT got larger - reallocate if necessary. -+ */ -+ oldsize = pc->size; -+ mincount = (mincount+511)&(~511); -+ newsize = mincount*LDT_ENTRY_SIZE; -+ for (i = 0; i < newsize; i += PAGE_SIZE) { -+ int nr = i/PAGE_SIZE; -+ BUG_ON(i >= 64*1024); -+ if (!pc->ldt_pages[nr]) { -+ pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC); -+ if (!pc->ldt_pages[nr]) -+ return -ENOMEM; -+ clear_highpage(pc->ldt_pages[nr]); -+ } -+ } -+ pc->size = mincount; -+ return 0; -+} -+ -+static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) -+{ -+ struct mm_struct *mm = current->mm; -+ int i; -+ int err; -+ int size; -+ -+ err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE); -+ if (err) -+ return err; -+ -+ size = mm->context.size*LDT_ENTRY_SIZE; -+ -+ for (i = 0; i < size; i += PAGE_SIZE) { -+ int nr = i / PAGE_SIZE, bytes; -+ char *kaddr = kmap(mm->context.ldt_pages[nr]); -+ -+ bytes = size - i; -+ if (bytes > PAGE_SIZE) -+ bytes = PAGE_SIZE; -+ err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i); -+ kunmap(mm->context.ldt_pages[nr]); -+ if (err) -+ return err; -+ } -+ -+ load_LDT(&mm->context); -+ return 0; -+} -+ -+#else -+ -+static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) -+{ -+ struct mm_struct *mm = current->mm; -+ int oldsize = mm->context.size; -+ void *oldldt; -+ void *newldt; -+ int err; -+ -+ if (li->cpt_size > PAGE_SIZE) -+ newldt = vmalloc(li->cpt_size); -+ else -+ newldt = kmalloc(li->cpt_size, GFP_KERNEL); -+ -+ if (!newldt) -+ return -ENOMEM; -+ -+ err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen); -+ if (err) -+ return err; -+ -+ oldldt = 
mm->context.ldt; -+ mm->context.ldt = newldt; -+ mm->context.size = li->cpt_size/LDT_ENTRY_SIZE; -+ -+ load_LDT(&mm->context); -+ -+ if (oldsize) { -+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) -+ vfree(oldldt); -+ else -+ kfree(oldldt); -+ } -+ return 0; -+} -+#endif -+ -+static int -+restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg) -+{ -+ struct aio_ring_info *info = &aio_ctx->ring_info; -+ unsigned nr_events = aio_ctx->max_reqs; -+ unsigned long size; -+ int nr_pages; -+ -+ /* We recalculate parameters of the ring exactly like -+ * fs/aio.c does and then compare calculated values -+ * with ones, stored in dump. They must be the same. */ -+ -+ nr_events += 2; -+ -+ size = sizeof(struct aio_ring); -+ size += sizeof(struct io_event) * nr_events; -+ nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; -+ -+ if (nr_pages != aimg->cpt_ring_pages) -+ return -EINVAL; -+ -+ info->nr_pages = nr_pages; -+ -+ nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); -+ -+ if (nr_events != aimg->cpt_nr) -+ return -EINVAL; -+ -+ info->nr = 0; -+ info->ring_pages = info->internal_pages; -+ if (nr_pages > AIO_RING_PAGES) { -+ info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); -+ if (!info->ring_pages) -+ return -ENOMEM; -+ memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); -+ } -+ -+ info->mmap_size = nr_pages * PAGE_SIZE; -+ -+ /* This piece of shit is not entirely my fault. Kernel aio.c makes -+ * something odd mmap()ping some pages and then pinning them. -+ * I guess it is just some mud remained of failed attempt to show ring -+ * to user space. The result is odd. :-) Immediately after -+ * creation of AIO context, kernel shares those pages with user -+ * and user can read and even write there. But after the first -+ * fork, pages are marked COW with evident consequences. 
-+ * I remember, I did the same mistake in the first version -+ * of mmapped packet socket, luckily that crap never reached -+ * mainstream. -+ * -+ * So, what are we going to do? I can simulate this odd behaviour -+ * exactly, but I am not insane yet. For now just take the pages -+ * from user space. Alternatively, we could keep kernel copy -+ * in AIO context image, which would be more correct. -+ * -+ * What is wrong now? If the pages are COWed, ring is transferred -+ * incorrectly. -+ */ -+ down_read(&current->mm->mmap_sem); -+ info->mmap_base = aimg->cpt_mmap_base; -+ info->nr_pages = get_user_pages(current, current->mm, -+ info->mmap_base, nr_pages, -+ 1, 0, info->ring_pages, NULL); -+ up_read(&current->mm->mmap_sem); -+ -+ if (unlikely(info->nr_pages != nr_pages)) { -+ int i; -+ -+ for (i=0; i<info->nr_pages; i++) -+ put_page(info->ring_pages[i]); -+ if (info->ring_pages && info->ring_pages != info->internal_pages) -+ kfree(info->ring_pages); -+ return -EFAULT; -+ } -+ -+ aio_ctx->user_id = info->mmap_base; -+ -+ info->nr = nr_events; -+ info->tail = aimg->cpt_tail; -+ -+ return 0; -+} -+ -+static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx) -+{ -+ int err; -+ struct kioctx *aio_ctx; -+ extern spinlock_t aio_nr_lock; -+ -+ aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); -+ if (!aio_ctx) -+ return -ENOMEM; -+ -+ memset(aio_ctx, 0, sizeof(*aio_ctx)); -+ aio_ctx->max_reqs = aimg->cpt_max_reqs; -+ -+ if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) { -+ kmem_cache_free(kioctx_cachep, aio_ctx); -+ eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err); -+ return err; -+ } -+ -+ aio_ctx->mm = current->mm; -+ atomic_inc(&aio_ctx->mm->mm_count); -+ atomic_set(&aio_ctx->users, 1); -+ spin_lock_init(&aio_ctx->ctx_lock); -+ spin_lock_init(&aio_ctx->ring_info.ring_lock); -+ init_waitqueue_head(&aio_ctx->wait); -+ INIT_LIST_HEAD(&aio_ctx->active_reqs); -+ INIT_LIST_HEAD(&aio_ctx->run_list); -+ INIT_WORK(&aio_ctx->wq, aio_kick_handler, ctx);
-+ -+ spin_lock(&aio_nr_lock); -+ aio_nr += aio_ctx->max_reqs; -+ spin_unlock(&aio_nr_lock); -+ -+ write_lock(&aio_ctx->mm->ioctx_list_lock); -+ aio_ctx->next = aio_ctx->mm->ioctx_list; -+ aio_ctx->mm->ioctx_list = aio_ctx; -+ write_unlock(&aio_ctx->mm->ioctx_list_lock); -+ -+ return 0; -+} -+ -+struct anonvma_map -+{ -+ struct hlist_node list; -+ struct anon_vma *avma; -+ __u64 id; -+}; -+ -+static int verify_create_anonvma(struct mm_struct *mm, -+ struct cpt_vma_image *vmai, -+ cpt_context_t *ctx) -+{ -+ struct anon_vma *avma = NULL; -+ struct anon_vma *new_avma; -+ struct vm_area_struct *vma; -+ int h; -+ -+ if (!ctx->anonvmas) { -+ if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE) -+ return -EINVAL; -+ if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL) -+ return -ENOMEM; -+ for (h = 0; h < CPT_ANONVMA_HSIZE; h++) -+ INIT_HLIST_HEAD(&ctx->anonvmas[h]); -+ } else { -+ struct anonvma_map *map; -+ struct hlist_node *elem; -+ -+ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); -+ hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) { -+ if (map->id == vmai->cpt_anonvmaid) { -+ avma = map->avma; -+ break; -+ } -+ } -+ } -+ -+ down_read(&mm->mmap_sem); -+ if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) { -+ up_read(&mm->mmap_sem); -+ return -ESRCH; -+ } -+ if (vma->vm_start != vmai->cpt_start) { -+ up_read(&mm->mmap_sem); -+ eprintk_ctx("vma start mismatch\n"); -+ return -EINVAL; -+ } -+ if (vma->vm_pgoff != vmai->cpt_pgoff) { -+ dprintk_ctx("vma pgoff mismatch, fixing\n"); -+ if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) { -+ eprintk_ctx("cannot fixup vma pgoff\n"); -+ up_read(&mm->mmap_sem); -+ return -EINVAL; -+ } -+ vma->vm_pgoff = vmai->cpt_pgoff; -+ } -+ -+ if (!vma->anon_vma) { -+ if (avma) { -+ vma->anon_vma = avma; -+ anon_vma_link(vma); -+ } else { -+ int err; -+ -+ err = anon_vma_prepare(vma); -+ -+ if (err) { -+ up_read(&mm->mmap_sem); -+ return err; -+ } -+ } -+ } else { -+ /* Note, we 
_can_ arrive to the situation, when two -+ * different anonvmaid's point to one anon_vma, this happens -+ * f.e. when mmap() merged new area to previous one and -+ * they will share one anon_vma even if they did not on -+ * original host. -+ * -+ * IT IS OK. To all that I understand, we may merge all -+ * the anon_vma's and rmap can scan all the huge list of vmas -+ * searching for page. It is just "suboptimal". -+ * -+ * Real disaster would happen, if vma already got an anon_vma -+ * with different id. It is very rare case, kernel does the -+ * best efforts to merge anon_vmas when some attributes are -+ * different. In this case we will fall to copying memory. -+ */ -+ if (avma && vma->anon_vma != avma) { -+ up_read(&mm->mmap_sem); -+ eprintk_ctx("anon_vma mismatch\n"); -+ return -ESRCH; -+ } -+ } -+ -+ new_avma = vma->anon_vma; -+ up_read(&mm->mmap_sem); -+ -+ if (!avma) { -+ struct anonvma_map *map; -+ -+ if (!new_avma) -+ return -EINVAL; -+ -+ if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL) -+ return -ENOMEM; -+ -+ map->id = vmai->cpt_anonvmaid; -+ map->avma = new_avma; -+ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); -+ hlist_add_head(&map->list, &ctx->anonvmas[h]); -+ } -+ return 0; -+} -+ -+static int copy_mm_pages(struct mm_struct *src, unsigned long start, -+ unsigned long end) -+{ -+ int err; -+ -+ for (; start < end; start += PAGE_SIZE) { -+ struct page *page; -+ struct page *spage; -+ void *maddr, *srcaddr; -+ -+ err = get_user_pages(current, current->mm, -+ start, 1, 1, 1, &page, NULL); -+ if (err == 0) -+ err = -EFAULT; -+ if (err < 0) -+ return err; -+ -+ err = get_user_pages(current, src, -+ start, 1, 0, 1, &spage, NULL); -+ -+ if (err == 0) -+ err = -EFAULT; -+ if (err < 0) { -+ page_cache_release(page); -+ return err; -+ } -+ -+ srcaddr = kmap(spage); -+ maddr = kmap(page); -+ memcpy(maddr, srcaddr, PAGE_SIZE); -+ set_page_dirty_lock(page); -+ kunmap(page); -+ kunmap(spage); -+ page_cache_release(page); -+ 
page_cache_release(spage); -+ } -+ return 0; -+} -+ -+static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx) -+{ -+ int err = 0; -+ unsigned long addr; -+ struct mm_struct *mm = current->mm; -+ struct vm_area_struct *vma; -+ struct file *file = NULL; -+ unsigned long prot; -+ int checked = 0; -+ -+ prot = make_prot(vmai); -+ -+ if (vmai->cpt_file != CPT_NULL) { -+ if (vmai->cpt_type == CPT_VMA_TYPE_0) { -+ file = rst_file(vmai->cpt_file, -1, ctx); -+ if (IS_ERR(file)) { -+ eprintk_ctx("do_rst_vma: rst_file: %Ld\n", vmai->cpt_file); -+ return PTR_ERR(file); -+ } -+ } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) { -+ file = rst_sysv_shm(vmai->cpt_file, ctx); -+ if (IS_ERR(file)) -+ return PTR_ERR(file); -+ } -+ } -+ -+ down_write(&mm->mmap_sem); -+ addr = do_mmap_pgoff(file, vmai->cpt_start, -+ vmai->cpt_end-vmai->cpt_start, -+ prot, make_flags(vmai), -+ vmai->cpt_pgoff); -+ -+ if (addr != vmai->cpt_start) { -+ up_write(&mm->mmap_sem); -+ -+ err = -EINVAL; -+ if (IS_ERR((void*)addr)) -+ err = addr; -+ goto out; -+ } -+ -+ vma = find_vma(mm, vmai->cpt_start); -+ if (vma == NULL) { -+ up_write(&mm->mmap_sem); -+ eprintk_ctx("cannot find mmapped vma\n"); -+ err = -ESRCH; -+ goto out; -+ } -+ -+ /* do_mmap_pgoff() can merge new area to previous one (not to the next, -+ * we mmap in order, the rest of mm is still unmapped). This can happen -+ * f.e. if flags are to be adjusted later, or if we had different -+ * anon_vma on two adjacent regions. Split it by brute force. 
*/ -+ if (vma->vm_start != vmai->cpt_start) { -+ dprintk_ctx("vma %Ld merged, split\n", vmapos); -+ err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0); -+ if (err) { -+ up_write(&mm->mmap_sem); -+ eprintk_ctx("cannot split vma\n"); -+ goto out; -+ } -+ } -+ up_write(&mm->mmap_sem); -+ -+ if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) { -+ err = verify_create_anonvma(mm, vmai, ctx); -+ if (err) { -+ eprintk_ctx("cannot verify_create_anonvma\n"); -+ goto out; -+ } -+ } -+ -+ if (vmai->cpt_next > vmai->cpt_hdrlen) { -+ loff_t offset = vmapos + vmai->cpt_hdrlen; -+ -+ do { -+ union { -+ struct cpt_page_block pb; -+ struct cpt_remappage_block rpb; -+ struct cpt_copypage_block cpb; -+ struct cpt_lazypage_block lpb; -+ } u; -+ loff_t pos; -+ -+ err = rst_get_object(-1, offset, &u, ctx); -+ if (err) { -+ eprintk_ctx("vma fix object: %d\n", err); -+ goto out; -+ } -+ if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) { -+ err = sc_remap_file_pages(u.rpb.cpt_start, -+ u.rpb.cpt_end-u.rpb.cpt_start, -+ 0, u.rpb.cpt_pgoff, 0); -+ if (err < 0) { -+ eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err, -+ (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), -+ (__u32)u.rpb.cpt_pgoff); -+ goto out; -+ } -+ offset += u.rpb.cpt_next; -+ continue; -+ } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) { -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ unsigned long addr = u.lpb.cpt_start; -+ -+ down_read(&mm->mmap_sem); -+ if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { -+ eprintk_ctx("lost vm_area_struct\n"); -+ err = -ESRCH; -+ goto out; -+ } -+ err = anon_vma_prepare(vma); -+ if (err) { -+ up_read(&mm->mmap_sem); -+ goto out; -+ } -+ while (addr < u.lpb.cpt_end) { -+ err = rst_pagein(vma, u.lpb.cpt_index + (addr-u.lpb.cpt_start)/PAGE_SIZE, -+ addr, ctx); -+ if (err) -+ break; -+ addr += PAGE_SIZE; -+ } -+ up_read(&mm->mmap_sem); -+#else -+ err = -EINVAL; -+#endif -+ if (err) -+ goto out; -+ offset += u.cpb.cpt_next; -+ continue; -+ } else if (u.cpb.cpt_object == 
CPT_OBJ_COPYPAGES) { -+ struct vm_area_struct *vma, *vma1; -+ struct mm_struct *src; -+ struct anon_vma *src_anon; -+ cpt_object_t *mobj; -+ -+ if (!vmai->cpt_anonvmaid) { -+ err = -EINVAL; -+ eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n"); -+ goto out; -+ } -+ -+ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx); -+ if (!mobj) { -+ eprintk_ctx("lost mm_struct to clone pages from\n"); -+ err = -ESRCH; -+ goto out; -+ } -+ src = mobj->o_obj; -+ -+ down_read(&src->mmap_sem); -+ src_anon = NULL; -+ vma1 = find_vma(src, u.cpb.cpt_start); -+ if (vma1) -+ src_anon = vma1->anon_vma; -+ up_read(&src->mmap_sem); -+ -+ if (!vma1) { -+ eprintk_ctx("lost src vm_area_struct\n"); -+ err = -ESRCH; -+ goto out; -+ } -+ -+ down_read(&mm->mmap_sem); -+ if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) { -+ up_read(&mm->mmap_sem); -+ eprintk_ctx("lost vm_area_struct\n"); -+ err = -ESRCH; -+ goto out; -+ } -+ -+ if (!src_anon || -+ !vma->anon_vma || -+ vma->anon_vma != src_anon || -+ vma->vm_start - vma1->vm_start != -+ (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) { -+ up_read(&mm->mmap_sem); -+ wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos); -+ err = copy_mm_pages(mobj->o_obj, -+ u.cpb.cpt_start, -+ u.cpb.cpt_end); -+ } else { -+ err = __copy_page_range(vma, vma1, -+ u.cpb.cpt_start, -+ u.cpb.cpt_end-u.cpb.cpt_start); -+ up_read(&mm->mmap_sem); -+ } -+ if (err) { -+ eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err, -+ (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), -+ (long)u.cpb.cpt_source); -+ goto out; -+ } -+ -+ offset += u.cpb.cpt_next; -+ continue; -+ } -+ if (u.pb.cpt_object != CPT_OBJ_PAGES) { -+ eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object); -+ err = -EINVAL; -+ goto out; -+ } -+ pos = offset + sizeof(u.pb); -+ if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) { -+ /* I guess this is get_user_pages() messed things, -+ * this happens f.e. when gdb inserts breakpoints. 
-+ */ -+ int i; -+ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) { -+ struct page *page; -+ void *maddr; -+ err = get_user_pages(current, current->mm, -+ (unsigned long)u.pb.cpt_start + i*PAGE_SIZE, -+ 1, 1, 1, &page, NULL); -+ if (err == 0) -+ err = -EFAULT; -+ if (err < 0) { -+ eprintk_ctx("get_user_pages: %d\n", err); -+ goto out; -+ } -+ err = 0; -+ maddr = kmap(page); -+ if (u.pb.cpt_content == CPT_CONTENT_VOID) { -+ memset(maddr, 0, PAGE_SIZE); -+ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { -+ err = ctx->pread(maddr, PAGE_SIZE, -+ ctx, pos + i*PAGE_SIZE); -+ if (err) { -+ kunmap(page); -+ goto out; -+ } -+ } else { -+ err = -EINVAL; -+ kunmap(page); -+ goto out; -+ } -+ set_page_dirty_lock(page); -+ kunmap(page); -+ page_cache_release(page); -+ } -+ } else { -+ if (!(prot&PROT_WRITE)) -+ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); -+ if (u.pb.cpt_content == CPT_CONTENT_VOID) { -+ int i; -+ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) { -+ err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i); -+ if (err) { -+ eprintk_ctx("__put_user 2 %d\n", err); -+ goto out; -+ } -+ } -+ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { -+ loff_t tpos = pos; -+ err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start), -+ u.pb.cpt_end-u.pb.cpt_start, -+ &tpos); -+ if (err != u.pb.cpt_end-u.pb.cpt_start) { -+ if (err >= 0) -+ err = -EIO; -+ goto out; -+ } -+ } else { -+ err = -EINVAL; -+ goto out; -+ } -+ if (!(prot&PROT_WRITE)) -+ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); -+ } -+ err = 0; -+ offset += u.pb.cpt_next; -+ } while (offset < vmapos + vmai->cpt_next); -+ } -+ -+check: -+ do { -+ struct vm_area_struct *vma; -+ down_read(&mm->mmap_sem); -+ vma = find_vma(mm, addr); -+ if (vma) { -+ if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) { -+ VM_ClearReadHint(vma); -+ vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK; -+ } -+ 
if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) { -+ dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos); -+ up_read(&mm->mmap_sem); -+ if (vma->vm_flags&VM_LOCKED) -+ err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); -+ else -+ err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); -+ if (err) -+ goto out; -+ goto check; -+ } -+ if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX) -+ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, -+ (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); -+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -+ if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) && -+ (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE)) -+ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, -+ (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); -+#endif -+ if (vma->vm_flags != vmai->cpt_flags) { -+ unsigned long x = vma->vm_flags ^ vmai->cpt_flags; -+ if (x & VM_EXEC) { -+ /* Crap. On i386 this is OK. -+ * It is impossible to make via mmap/mprotect -+ * exec.c clears VM_EXEC on stack. 
*/ -+ vma->vm_flags &= ~VM_EXEC; -+ } else if ((x & VM_ACCOUNT) && !checked) { -+ checked = 1; -+ if (!(prot&PROT_WRITE)) { -+ up_read(&mm->mmap_sem); -+ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); -+ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); -+ goto check; -+ } -+ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, -+ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); -+ } else { -+ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, -+ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); -+ } -+ } -+ } else { -+ wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos); -+ } -+ up_read(&mm->mmap_sem); -+ } while (0); -+ -+out: -+ if (file) -+ fput(file); -+ return err; -+} -+ -+static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx) -+{ -+ int err = 0; -+ unsigned int def_flags; -+ struct mm_struct *mm = current->mm; -+ -+ down_write(&mm->mmap_sem); -+ do_munmap(mm, 0, TASK_SIZE); -+ -+ mm->start_code = vmi->cpt_start_code; -+ mm->end_code = vmi->cpt_end_code; -+ mm->start_data = vmi->cpt_start_data; -+ mm->end_data = vmi->cpt_end_data; -+ mm->start_brk = vmi->cpt_start_brk; -+ mm->brk = vmi->cpt_brk; -+ mm->start_stack = vmi->cpt_start_stack; -+ mm->arg_start = vmi->cpt_start_arg; -+ mm->arg_end = vmi->cpt_end_arg; -+ mm->env_start = vmi->cpt_start_env; -+ mm->env_end = vmi->cpt_end_env; -+ mm->def_flags = 0; -+ def_flags = vmi->cpt_def_flags; -+ -+ mm->dumpable = (vmi->cpt_dumpable != 0); -+ mm->vps_dumpable = (vmi->cpt_vps_dumpable != 0); -+ -+#if 0 /* def CONFIG_HUGETLB_PAGE*/ -+/* NB: ? 
*/ -+ int used_hugetlb; -+#endif -+ up_write(&mm->mmap_sem); -+ -+ if (vmi->cpt_next > vmi->cpt_hdrlen) { -+ loff_t offset = pos + vmi->cpt_hdrlen; -+ do { -+ union { -+ struct cpt_vma_image vmai; -+ struct cpt_aio_ctx_image aioi; -+ struct cpt_obj_bits bits; -+ } u; -+ err = rst_get_object(-1, offset, &u, ctx); -+ if (err) -+ goto out; -+ if (u.vmai.cpt_object == CPT_OBJ_VMA) { -+ err = do_rst_vma(&u.vmai, offset, pos, ctx); -+ if (err) -+ goto out; -+ } else if (u.bits.cpt_object == CPT_OBJ_BITS && -+ u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) { -+ err = do_rst_ldt(&u.bits, offset, ctx); -+ if (err) -+ goto out; -+ } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) { -+ err = do_rst_aio(&u.aioi, offset, ctx); -+ if (err) -+ goto out; -+ } else { -+ eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object); -+ err = -EINVAL; -+ goto out; -+ } -+ offset += u.vmai.cpt_next; -+ } while (offset < pos + vmi->cpt_next); -+ } -+ -+ down_write(&mm->mmap_sem); -+ mm->def_flags = def_flags; -+ up_write(&mm->mmap_sem); -+ -+ -+out: -+ return err; -+} -+ -+extern void exit_mm(struct task_struct * tsk); -+ -+int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ int err = 0; -+ cpt_object_t *mobj; -+ void *tmp = (void*)__get_free_page(GFP_KERNEL); -+ struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp; -+ -+ if (!tmp) -+ return -ENOMEM; -+ -+ if (ti->cpt_mm == CPT_NULL) { -+ if (current->mm) -+ exit_mm(current); -+ goto out; -+ } -+ -+ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); -+ if (mobj) { -+ if (current->mm != mobj->o_obj) BUG(); -+ goto out; -+ } -+ -+ if (current->mm == NULL) { -+ struct mm_struct *mm = mm_alloc(); -+ if (mm == NULL) { -+ err = -ENOMEM; -+ goto out; -+ } -+ err = init_new_context(current, mm); -+ if (err) { -+ mmdrop(mm); -+ goto out; -+ } -+ current->mm = mm; -+ } -+ -+ if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0) -+ goto out; -+ if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) 
{ -+ eprintk_ctx("do_rst_mm %Ld\n", ti->cpt_mm); -+ goto out; -+ } -+ err = -ENOMEM; -+ mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx); -+ if (mobj != NULL) { -+ err = 0; -+ cpt_obj_setpos(mobj, ti->cpt_mm, ctx); -+ } -+ -+out: -+ if (tmp) -+ free_page((unsigned long)tmp); -+ return err; -+} -+ -+/* This is part of mm setup, made in parent context. Mostly, it is the place, -+ * where we graft mm of another process to child. -+ */ -+ -+int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ task_t *tsk = obj->o_obj; -+ cpt_object_t *mobj; -+ -+ /* Task without mm. Just get rid of this. */ -+ if (ti->cpt_mm == CPT_NULL) { -+ if (tsk->mm) { -+ mmput(tsk->mm); -+ tsk->mm = NULL; -+ } -+ return 0; -+ } -+ -+ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); -+ if (mobj) { -+ struct mm_struct *newmm = mobj->o_obj; -+ /* Good, the MM is already created. */ -+ if (newmm == tsk->mm) { -+ /* Already done by clone(). */ -+ return 0; -+ } -+ mmput(tsk->mm); -+ atomic_inc(&newmm->mm_users); -+ tsk->mm = newmm; -+ tsk->active_mm = newmm; -+ } -+ return 0; -+} -+ -+/* We use CLONE_VM when mm of child is going to be shared with parent. -+ * Otherwise mm is copied. -+ */ -+ -+__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ if (ti->cpt_mm == CPT_NULL || -+ lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx)) -+ return CLONE_VM; -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_net.c linux-2.6.16-026test009/kernel/cpt/rst_net.c ---- linux-2.6.16.orig/kernel/cpt/rst_net.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_net.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,444 @@ -+/* -+ * -+ * kernel/cpt/rst_net.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/fs.h> -+#include <linux/socket.h> -+#include <linux/netdevice.h> -+#include <linux/inetdevice.h> -+#include <linux/rtnetlink.h> -+#include <linux/ve.h> -+#include <linux/ve_proto.h> -+#include <net/route.h> -+#include <net/ip_fib.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_kernel.h" -+#include "cpt_net.h" -+ -+#include "cpt_syscalls.h" -+ -+extern struct in_ifaddr *inet_alloc_ifa(void); -+extern int inet_insert_ifa(struct in_ifaddr *ifa); -+ -+int rst_restore_ifaddr(struct cpt_context *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ struct cpt_ifaddr_image di; -+ struct net_device *dev; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ int cindex = -1; -+ int err; -+ err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx); -+ if (err) -+ return err; -+ if (di.cpt_index == ctx->lo_index_old) -+ cindex = ctx->lo_index; -+ else if (di.cpt_index == ctx->venet_index_old) -+ cindex = ctx->venet_index; -+ if (cindex <= 0) -+ eprintk_ctx("unknown ifaddr for %d\n", di.cpt_index); -+ rtnl_lock(); -+ dev = __dev_get_by_index(cindex); -+ if (dev && di.cpt_family == AF_INET) { -+ struct in_device *in_dev; -+ struct in_ifaddr *ifa; -+ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) -+ in_dev = inetdev_init(dev); -+ ifa = inet_alloc_ifa(); -+ if (ifa) { -+ ifa->ifa_local = di.cpt_address[0]; -+ ifa->ifa_address = di.cpt_peer[0]; -+ 
ifa->ifa_broadcast = di.cpt_broadcast[0]; -+ ifa->ifa_prefixlen = di.cpt_masklen; -+ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); -+ ifa->ifa_flags = di.cpt_flags; -+ ifa->ifa_scope = di.cpt_scope; -+ memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ); -+ in_dev_hold(in_dev); -+ ifa->ifa_dev = in_dev; -+ err = inet_insert_ifa(ifa); -+ if (err && err != -EEXIST) { -+ rtnl_unlock(); -+ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); -+ return err; -+ } -+ } -+ } else { -+ rtnl_unlock(); -+ eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index); -+ return -EINVAL; -+ } -+ rtnl_unlock(); -+ sec += di.cpt_next; -+ } -+ return 0; -+} -+ -+static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx) -+{ -+ int min_len = NLMSG_LENGTH(sizeof(struct rtmsg)); -+ struct rtmsg *rtm = NLMSG_DATA(nlh); -+ -+ if (nlh->nlmsg_len > min_len) { -+ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); -+ struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len); -+ -+ while (RTA_OK(rta, attrlen)) { -+ if (rta->rta_type == RTA_OIF) { -+ int idx = *(int*)RTA_DATA(rta); -+ if (idx == ctx->lo_index_old) -+ idx = ctx->lo_index; -+ else if (idx == ctx->venet_index_old) -+ idx = ctx->venet_index; -+ else { -+ eprintk_ctx("unknown iface %d\n", idx); -+ return -ENODEV; -+ } -+ *(int*)RTA_DATA(rta) = idx; -+ } -+ rta = RTA_NEXT(rta, attrlen); -+ } -+ } -+ return rtm->rtm_protocol == RTPROT_KERNEL; -+} -+ -+int rst_restore_route(struct cpt_context *ctx) -+{ -+ int err; -+ struct socket *sock; -+ struct msghdr msg; -+ struct iovec iov; -+ struct sockaddr_nl nladdr; -+ mm_segment_t oldfs; -+ loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ struct cpt_object_hdr v; -+ char *pg; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ if (h.cpt_hdrlen >= 
h.cpt_next) -+ return 0; -+ -+ sec += h.cpt_hdrlen; -+ err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx); -+ if (err < 0) -+ return err; -+ -+ err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); -+ if (err) -+ return err; -+ -+ pg = (char*)__get_free_page(GFP_KERNEL); -+ if (pg == NULL) { -+ err = -ENOMEM; -+ goto out_sock; -+ } -+ -+ memset(&nladdr, 0, sizeof(nladdr)); -+ nladdr.nl_family = AF_NETLINK; -+ -+ endsec = sec + v.cpt_next; -+ sec += v.cpt_hdrlen; -+ -+ while (sec < endsec) { -+ struct nlmsghdr *n; -+ struct nlmsghdr nh; -+ int kernel_flag; -+ -+ err = ctx->pread(&nh, sizeof(nh), ctx, sec); -+ if (err) -+ goto out_sock_pg; -+ if (nh.nlmsg_len > PAGE_SIZE) { -+ err = -EINVAL; -+ goto out_sock_pg; -+ } -+ err = ctx->pread(pg, nh.nlmsg_len, ctx, sec); -+ if (err) -+ goto out_sock_pg; -+ -+ n = (struct nlmsghdr*)pg; -+ n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE; -+ -+ err = rewrite_rtmsg(n, ctx); -+ if (err < 0) -+ goto out_sock_pg; -+ kernel_flag = err; -+ -+ iov.iov_base=n; -+ iov.iov_len=nh.nlmsg_len; -+ msg.msg_name=&nladdr; -+ msg.msg_namelen=sizeof(nladdr); -+ msg.msg_iov=&iov; -+ msg.msg_iovlen=1; -+ msg.msg_control=NULL; -+ msg.msg_controllen=0; -+ msg.msg_flags=MSG_DONTWAIT; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ err = sock_sendmsg(sock, &msg, nh.nlmsg_len); -+ set_fs(oldfs); -+ -+ if (err < 0) -+ goto out_sock_pg; -+ err = 0; -+ -+ iov.iov_base=pg; -+ iov.iov_len=PAGE_SIZE; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); -+ set_fs(oldfs); -+ if (err != -EAGAIN) { -+ if (err == NLMSG_LENGTH(sizeof(struct nlmsgerr)) && -+ n->nlmsg_type == NLMSG_ERROR) { -+ struct nlmsgerr *e = NLMSG_DATA(n); -+ if (e->error != -EEXIST || !kernel_flag) -+ eprintk_ctx("NLMERR: %d\n", e->error); -+ } else { -+ eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type); -+ } -+ } -+ err = 0; -+ sec += NLMSG_ALIGN(nh.nlmsg_len); -+ } -+ -+out_sock_pg: -+ free_page((unsigned long)pg); 
-+out_sock: -+ sock_release(sock); -+ return err; -+} -+ -+int rst_resume_network(struct cpt_context *ctx) -+{ -+ struct ve_struct *env; -+ -+ env = get_ve_by_id(ctx->ve_id); -+ if (!env) -+ return -ESRCH; -+ env->disable_net = 0; -+ put_ve(env); -+ return 0; -+} -+ -+int rst_restore_netdev(struct cpt_context *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ struct cpt_netdev_image di; -+ struct net_device *dev; -+ -+ get_exec_env()->disable_net = 1; -+ -+ dev = __dev_get_by_name("lo"); -+ if (!dev) { -+ eprintk_ctx("cannot find loopback netdevice\n"); -+ return -EINVAL; -+ } -+ ctx->lo_index = dev->ifindex; -+ ctx->lo_index_old = -1; -+ dev = __dev_get_by_name("venet0"); -+ if (!dev) { -+ eprintk_ctx("cannot find venet0 netdevice\n"); -+ return -EINVAL; -+ } -+ ctx->venet_index = dev->ifindex; -+ ctx->venet_index_old = -1; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ int err; -+ err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx); -+ if (err) -+ return err; -+ if (strcmp(di.cpt_name, "lo") == 0) { -+ ctx->lo_index_old = di.cpt_index; -+ } else if (strcmp(di.cpt_name, "venet0") == 0) { -+ ctx->venet_index_old = di.cpt_index; -+ } else { -+ eprintk_ctx("unknown interface %s\n", di.cpt_name); -+ } -+ dev = __dev_get_by_name(di.cpt_name); -+ if (dev) { -+ if (di.cpt_flags^dev->flags) { -+ rtnl_lock(); -+ err = dev_change_flags(dev, di.cpt_flags); -+ rtnl_unlock(); -+ if (err) -+ eprintk_ctx("dev_change_flags err: %d\n", err); -+ } -+ } else { -+ eprintk_ctx("unknown interface 2 %s\n", di.cpt_name); -+ } -+ sec += di.cpt_next; -+ } -+ return 0; -+} -+ -+static int dumpfn(void *arg) -+{ -+ int i; -+ int *pfd = arg; -+ char *argv[] = { 
"iptables-restore", "-c", NULL }; -+ -+ if (pfd[0] != 0) -+ sc_dup2(pfd[0], 0); -+ -+ for (i=1; i<current->files->fdt->max_fds; i++) -+ sc_close(i); -+ -+ module_put(THIS_MODULE); -+ -+ set_fs(KERNEL_DS); -+ i = sc_execve("/sbin/iptables-restore", argv, NULL); -+ eprintk("failed to exec /sbin/iptables-restore: %d\n", i); -+ return -1; -+} -+ -+static int rst_restore_iptables(struct cpt_context * ctx) -+{ -+ int err; -+ int pfd[2]; -+ struct file *f; -+ struct cpt_object_hdr v; -+ int n; -+ struct cpt_section_hdr h; -+ loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES]; -+ loff_t end; -+ int pid; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ if (h.cpt_hdrlen == h.cpt_next) -+ return 0; -+ if (h.cpt_hdrlen > h.cpt_next) -+ return -EINVAL; -+ sec += h.cpt_hdrlen; -+ err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx); -+ if (err < 0) -+ return err; -+ -+ err = sc_pipe(pfd); -+ if (err < 0) -+ return err; -+ pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); -+ if (err < 0) -+ goto out; -+ f = fget(pfd[1]); -+ sc_close(pfd[1]); -+ sc_close(pfd[0]); -+ -+ ctx->file->f_pos = sec + v.cpt_hdrlen; -+ end = sec + v.cpt_next; -+ do { -+ char *p; -+ char buf[16]; -+ mm_segment_t oldfs; -+ -+ n = end - ctx->file->f_pos; -+ if (n > sizeof(buf)) -+ n = sizeof(buf); -+ -+ if (ctx->read(buf, n, ctx)) -+ break; -+ if ((p = memchr(buf, 0, n)) != NULL) -+ n = p - buf; -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ f->f_op->write(f, buf, n, &f->f_pos); -+ set_fs(oldfs); -+ } while (ctx->file->f_pos < end); -+ -+ fput(f); -+ -+ clear_tsk_thread_flag(current,TIF_SIGPENDING); -+ -+ if ((err = sc_waitx(pid, 0)) < 0) -+ eprintk_ctx("wait4: %d\n", err); -+ -+ return 0; -+ -+out: -+ if (pfd[1] >= 0) -+ sc_close(pfd[1]); -+ if (pfd[0] >= 0) -+ sc_close(pfd[0]); -+ return err; -+} -+ -+int rst_restore_net(struct 
cpt_context *ctx) -+{ -+ int err; -+ -+ err = rst_restore_netdev(ctx); -+ if (!err) -+ err = rst_restore_ifaddr(ctx); -+ if (!err) -+ err = rst_restore_route(ctx); -+ if (!err) -+ err = rst_restore_iptables(ctx); -+ if (!err) -+ err = rst_restore_ip_conntrack(ctx); -+ return err; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_proc.c linux-2.6.16-026test009/kernel/cpt/rst_proc.c ---- linux-2.6.16.orig/kernel/cpt/rst_proc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_proc.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,606 @@ -+/* -+ * -+ * kernel/cpt/rst_proc.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/errno.h> -+#include <linux/mm.h> -+#include <linux/proc_fs.h> -+#include <linux/smp_lock.h> -+#include <asm/uaccess.h> -+#include <linux/cpt_ioctl.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_dump.h" -+#include "cpt_files.h" -+#include "cpt_mm.h" -+#include "cpt_kernel.h" -+ -+MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>"); -+MODULE_LICENSE("GPL"); -+ -+/* List of contexts and lock protecting the list */ -+struct list_head cpt_context_list; -+spinlock_t cpt_context_lock; -+ -+static int proc_read(char *buffer, char **start, off_t offset, -+ int length, int *eof, void *data) -+{ -+ off_t pos = 0; -+ off_t begin = 0; -+ int len = 0; -+ cpt_context_t *ctx; -+ -+ len += sprintf(buffer, "Ctx Id VE State\n"); -+ -+ spin_lock(&cpt_context_lock); -+ -+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { -+ len += sprintf(buffer+len,"%p %08x %-8u %d", -+ ctx, -+ ctx->contextid, -+ ctx->ve_id, -+ ctx->ctx_state -+ ); -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ len += 
pagein_info_printf(buffer+len, ctx); -+#endif -+ -+ buffer[len++] = '\n'; -+ -+ pos = begin+len; -+ if (pos < offset) { -+ len = 0; -+ begin = pos; -+ } -+ if (pos > offset+length) -+ goto done; -+ } -+ *eof = 1; -+ -+done: -+ spin_unlock(&cpt_context_lock); -+ *start = buffer + (offset - begin); -+ len -= (offset - begin); -+ if(len > length) -+ len = length; -+ if(len < 0) -+ len = 0; -+ return len; -+} -+ -+void rst_context_release(cpt_context_t *ctx) -+{ -+ list_del(&ctx->ctx_list); -+ spin_unlock(&cpt_context_lock); -+ -+ if (ctx->ctx_state > 0) -+ rst_resume(ctx); -+ ctx->ctx_state = CPT_CTX_ERROR; -+ -+ rst_close_dumpfile(ctx); -+ -+ if (ctx->anonvmas) { -+ int h; -+ for (h = 0; h < CPT_ANONVMA_HSIZE; h++) { -+ while (!hlist_empty(&ctx->anonvmas[h])) { -+ struct hlist_node *elem = ctx->anonvmas[h].first; -+ hlist_del(elem); -+ kfree(elem); -+ } -+ } -+ free_page((unsigned long)ctx->anonvmas); -+ } -+ cpt_flush_error(ctx); -+ if (ctx->errorfile) { -+ fput(ctx->errorfile); -+ ctx->errorfile = NULL; -+ } -+ if (ctx->error_msg) { -+ free_page((unsigned long)ctx->error_msg); -+ ctx->error_msg = NULL; -+ } -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ if (ctx->pagein_file_out) -+ fput(ctx->pagein_file_out); -+ if (ctx->pagein_file_in) -+ fput(ctx->pagein_file_in); -+ if (ctx->pgin_task) -+ put_task_struct(ctx->pgin_task); -+#endif -+ if (ctx->filejob_queue) -+ rst_flush_filejobs(ctx); -+ if (ctx->objcount) -+ eprintk_ctx("%d objects leaked\n", ctx->objcount); -+ kfree(ctx); -+ -+ spin_lock(&cpt_context_lock); -+} -+ -+static void __cpt_context_put(cpt_context_t *ctx) -+{ -+ if (!--ctx->refcount) -+ rst_context_release(ctx); -+} -+ -+static void cpt_context_put(cpt_context_t *ctx) -+{ -+ spin_lock(&cpt_context_lock); -+ __cpt_context_put(ctx); -+ spin_unlock(&cpt_context_lock); -+} -+ -+cpt_context_t * rst_context_open(void) -+{ -+ cpt_context_t *ctx; -+ -+ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { -+ rst_context_init(ctx); -+ spin_lock(&cpt_context_lock); 
-+ list_add_tail(&ctx->ctx_list, &cpt_context_list); -+ spin_unlock(&cpt_context_lock); -+ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); -+ if (ctx->error_msg != NULL) -+ ctx->error_msg[0] = 0; -+ } -+ return ctx; -+} -+ -+void rst_report_error(int err, cpt_context_t *ctx) -+{ -+ if (ctx->statusfile) { -+ mm_segment_t oldfs; -+ int status = 7 /* VZ_ENVCREATE_ERROR */; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ if (ctx->statusfile->f_op && ctx->statusfile->f_op->write) -+ ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos); -+ set_fs(oldfs); -+ fput(ctx->statusfile); -+ ctx->statusfile = NULL; -+ } -+} -+ -+ -+cpt_context_t * cpt_context_lookup(unsigned int ctxid) -+{ -+ cpt_context_t *ctx; -+ -+ spin_lock(&cpt_context_lock); -+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { -+ if (ctx->contextid == ctxid) { -+ ctx->refcount++; -+ spin_unlock(&cpt_context_lock); -+ return ctx; -+ } -+ } -+ spin_unlock(&cpt_context_lock); -+ return NULL; -+} -+ -+static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) -+{ -+ int err = 0; -+ cpt_context_t *ctx; -+ struct file *dfile = NULL; -+ -+ unlock_kernel(); -+ -+ if (cmd == CPT_TEST_CAPS) { -+ err = test_cpu_caps(); -+ goto out_lock; -+ } -+ -+ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { -+ cpt_context_t *old_ctx; -+ -+ ctx = NULL; -+ if (cmd == CPT_JOIN_CONTEXT) { -+ err = -ENOENT; -+ ctx = cpt_context_lookup(arg); -+ if (!ctx) -+ goto out_lock; -+ } -+ -+ spin_lock(&cpt_context_lock); -+ old_ctx = (cpt_context_t*)file->private_data; -+ file->private_data = ctx; -+ -+ if (old_ctx) { -+ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { -+ old_ctx->sticky = 0; -+ old_ctx->refcount--; -+ } -+ __cpt_context_put(old_ctx); -+ } -+ spin_unlock(&cpt_context_lock); -+ err = 0; -+ goto out_lock; -+ } -+ -+ spin_lock(&cpt_context_lock); -+ ctx = (cpt_context_t*)file->private_data; -+ if (ctx) -+ ctx->refcount++; -+ 
spin_unlock(&cpt_context_lock); -+ -+ if (!ctx) { -+ cpt_context_t *old_ctx; -+ -+ err = -ENOMEM; -+ ctx = rst_context_open(); -+ if (!ctx) -+ goto out_lock; -+ -+ spin_lock(&cpt_context_lock); -+ old_ctx = (cpt_context_t*)file->private_data; -+ if (!old_ctx) { -+ ctx->refcount++; -+ file->private_data = ctx; -+ } else { -+ old_ctx->refcount++; -+ } -+ if (old_ctx) { -+ __cpt_context_put(ctx); -+ ctx = old_ctx; -+ } -+ spin_unlock(&cpt_context_lock); -+ } -+ -+ if (cmd == CPT_GET_CONTEXT) { -+ unsigned int contextid = (unsigned int)arg; -+ -+ err = -EINVAL; -+ if (ctx->contextid && ctx->contextid != contextid) -+ goto out_nosem; -+ if (!ctx->contextid) { -+ cpt_context_t *c1 = cpt_context_lookup(contextid); -+ if (c1) { -+ cpt_context_put(c1); -+ err = -EEXIST; -+ goto out_nosem; -+ } -+ ctx->contextid = contextid; -+ } -+ spin_lock(&cpt_context_lock); -+ if (!ctx->sticky) { -+ ctx->sticky = 1; -+ ctx->refcount++; -+ } -+ spin_unlock(&cpt_context_lock); -+ err = 0; -+ goto out_nosem; -+ } -+ -+ down(&ctx->main_sem); -+ -+ err = -EBUSY; -+ if (ctx->ctx_state < 0) -+ goto out; -+ -+ err = 0; -+ switch (cmd) { -+ case CPT_SET_DUMPFD: -+ if (ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ if (dfile->f_op == NULL || -+ dfile->f_op->read == NULL) { -+ fput(dfile); -+ err = -EBADF; -+ break; -+ } -+ } -+ if (ctx->file) -+ fput(ctx->file); -+ ctx->file = dfile; -+ break; -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ case CPT_SET_PAGEINFDIN: -+ if (ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ } -+ if (ctx->pagein_file_in) -+ fput(ctx->pagein_file_in); -+ ctx->pagein_file_in = dfile; -+ break; -+ case CPT_SET_PAGEINFDOUT: -+ if (ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = 
PTR_ERR(dfile); -+ break; -+ } -+ } -+ if (ctx->pagein_file_out) -+ fput(ctx->pagein_file_out); -+ ctx->pagein_file_out = dfile; -+ break; -+ case CPT_PAGEIND: -+ err = rst_pageind(ctx); -+ break; -+#endif -+ case CPT_SET_LOCKFD: -+ if (ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ } -+ if (ctx->lockfile) -+ fput(ctx->lockfile); -+ ctx->lockfile = dfile; -+ break; -+ case CPT_SET_STATUSFD: -+ if (ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ } -+ if (ctx->statusfile) -+ fput(ctx->statusfile); -+ ctx->statusfile = dfile; -+ break; -+ case CPT_SET_ERRORFD: -+ if (arg >= 0) { -+ dfile = fget(arg); -+ if (IS_ERR(dfile)) { -+ err = PTR_ERR(dfile); -+ break; -+ } -+ } -+ if (ctx->errorfile) -+ fput(ctx->errorfile); -+ ctx->errorfile = dfile; -+ break; -+ case CPT_SET_VEID: -+ if (ctx->ctx_state > 0) { -+ err = -EBUSY; -+ break; -+ } -+ ctx->ve_id = arg; -+ break; -+ case CPT_UNDUMP: -+ if (ctx->ctx_state > 0) { -+ err = -ENOENT; -+ break; -+ } -+ ctx->ctx_state = CPT_CTX_UNDUMPING; -+ err = vps_rst_undump(ctx); -+ if (err) { -+ rst_report_error(err, ctx); -+ if (rst_kill(ctx) == 0) -+ ctx->ctx_state = CPT_CTX_IDLE; -+ } else { -+ ctx->ctx_state = CPT_CTX_UNDUMPED; -+ } -+ break; -+ case CPT_RESUME: -+ if (!ctx->ctx_state) { -+ err = -ENOENT; -+ break; -+ } -+ err = rst_resume(ctx); -+ if (!err) -+ ctx->ctx_state = CPT_CTX_IDLE; -+ break; -+ case CPT_KILL: -+ if (!ctx->ctx_state) { -+ err = -ENOENT; -+ break; -+ } -+ err = rst_kill(ctx); -+ if (!err) -+ ctx->ctx_state = CPT_CTX_IDLE; -+ break; -+ default: -+ err = -EINVAL; -+ break; -+ } -+ -+out: -+ cpt_flush_error(ctx); -+ up(&ctx->main_sem); -+out_nosem: -+ cpt_context_put(ctx); -+out_lock: -+ lock_kernel(); -+ return err; -+} -+ -+static int rst_open(struct inode * inode, struct file * 
file) -+{ -+ if (!try_module_get(THIS_MODULE)) -+ return -EBUSY; -+ -+ return 0; -+} -+ -+static int rst_release(struct inode * inode, struct file * file) -+{ -+ cpt_context_t *ctx; -+ -+ spin_lock(&cpt_context_lock); -+ ctx = (cpt_context_t*)file->private_data; -+ file->private_data = NULL; -+ if (ctx) -+ __cpt_context_put(ctx); -+ spin_unlock(&cpt_context_lock); -+ -+ -+ module_put(THIS_MODULE); -+ return 0; -+} -+ -+static struct file_operations rst_fops = -+{ -+ .owner = THIS_MODULE, -+ .ioctl = rst_ioctl, -+ .open = rst_open, -+ .release = rst_release, -+}; -+ -+ -+static struct proc_dir_entry *proc_ent; -+extern void *schedule_tail_p; -+extern void schedule_tail_hook(void); -+ -+int debug_level = 1; -+ -+static struct ctl_table_header *ctl_header; -+ -+static ctl_table debug_table[] = { -+ { -+ .ctl_name = 9476, -+ .procname = "rst", -+ .data = &debug_level, -+ .maxlen = sizeof(debug_level), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { .ctl_name = 0 } -+}; -+static ctl_table root_table[] = { -+ { -+ .ctl_name = CTL_DEBUG, -+ .procname = "debug", -+ .mode = 0555, -+ .child = debug_table, -+ }, -+ { .ctl_name = 0 } -+}; -+ -+#ifdef CONFIG_X86_64 -+ -+static void *vzentry_forkret_get(void) -+{ -+ unsigned char *p; -+ -+ p = (unsigned char *)ret_from_fork; -+ return (void *)(*(u32 *)(p + 1) + p + 5); -+} -+ -+static void vzentry_forkret_set(void *data) -+{ -+ unsigned char *p; -+ long offset; -+ -+ p = (unsigned char *)ret_from_fork; -+ offset = (unsigned long)data - (unsigned long)(p + 5); -+ if ((long)(s32)offset != offset) { -+ printk("vzentry_forkret_set: too long hook offset\n"); -+ BUG(); -+ } -+ *(u32 *)(p + 1) = offset; -+} -+#endif -+ -+static int __init init_rst(void) -+{ -+ int err; -+ -+ err = -ENOMEM; -+ ctl_header = register_sysctl_table(root_table, 0); -+ if (!ctl_header) -+ goto err_mon; -+ -+ spin_lock_init(&cpt_context_lock); -+ INIT_LIST_HEAD(&cpt_context_list); -+ -+ err = -EINVAL; -+ proc_ent = create_proc_entry("rst", 0600, 
NULL); -+ if (!proc_ent) -+ goto err_out; -+ -+ rst_fops.read = proc_ent->proc_fops->read; -+ rst_fops.write = proc_ent->proc_fops->write; -+ rst_fops.llseek = proc_ent->proc_fops->llseek; -+ proc_ent->proc_fops = &rst_fops; -+ -+ proc_ent->read_proc = proc_read; -+ proc_ent->data = NULL; -+ proc_ent->owner = THIS_MODULE; -+#ifdef CONFIG_X86_64 -+ schedule_tail_p = vzentry_forkret_get(); -+ vzentry_forkret_set(&schedule_tail_hook); -+#endif -+ return 0; -+ -+err_out: -+ unregister_sysctl_table(ctl_header); -+err_mon: -+ return err; -+} -+module_init(init_rst); -+ -+static void __exit exit_rst(void) -+{ -+#ifdef CONFIG_X86_64 -+ /* This is wrong, of course. But still the best what we can do. */ -+ vzentry_forkret_set(schedule_tail_p); -+#endif -+ -+ remove_proc_entry("rst", NULL); -+ unregister_sysctl_table(ctl_header); -+ -+ spin_lock(&cpt_context_lock); -+ while (!list_empty(&cpt_context_list)) { -+ cpt_context_t *ctx; -+ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); -+ -+ if (!ctx->sticky) -+ ctx->refcount++; -+ ctx->sticky = 0; -+ -+ BUG_ON(ctx->refcount != 1); -+ -+ __cpt_context_put(ctx); -+ } -+ spin_unlock(&cpt_context_lock); -+} -+module_exit(exit_rst); -diff -upr linux-2.6.16.orig/kernel/cpt/rst_process.c linux-2.6.16-026test009/kernel/cpt/rst_process.c ---- linux-2.6.16.orig/kernel/cpt/rst_process.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_process.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,1254 @@ -+/* -+ * -+ * kernel/cpt/rst_process.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/virtinfo.h> -+#include <linux/kmem_cache.h> -+#include <linux/errno.h> -+#include <linux/pagemap.h> -+#include <linux/ptrace.h> -+#include <linux/tty.h> -+#include <asm/desc.h> -+#include <asm/unistd.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_misc.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_files.h" -+#include "cpt_mm.h" -+#include "cpt_ubc.h" -+#include "cpt_process.h" -+#include "cpt_kernel.h" -+ -+#ifdef CONFIG_X86_64 -+ -+#define _TIF_RESUME (1<<22) -+ -+#define SYSCALL_NR(regs) ((regs)->orig_rax) -+#define SYSCALL_RETVAL(regs) ((regs)->rax) -+#define SYSCALL_PC(regs) ((regs)->rip) -+ -+#define ESP(tsk) (tsk)->thread.rsp -+ -+#define __NR32_restart_syscall 0 -+#define __NR32_rt_sigtimedwait 177 -+#define __NR32_pause 29 -+#define __NR32_futex 240 -+ -+#define syscall_is(tsk,regs,name) ((!((tsk)->thread_info->flags&_TIF_IA32) && \ -+ SYSCALL_NR(regs) == __NR_##name) || \ -+ (((tsk)->thread_info->flags&_TIF_IA32) && \ -+ SYSCALL_NR(regs) == __NR32_##name)) -+#else -+ -+#define SYSCALL_NR(regs) ((regs)->orig_eax) -+#define SYSCALL_RETVAL(regs) ((regs)->eax) -+#define SYSCALL_PC(regs) ((regs)->eip) -+ -+#define ESP(tsk) (tsk)->thread.esp -+ -+#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) -+ -+#undef task_pt_regs -+#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1) -+ -+#endif -+ -+static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si) -+{ -+ memset(info, 0, sizeof(*info)); -+ switch(si->cpt_code & __SI_MASK) { -+ case __SI_TIMER: -+ info->si_tid = si->cpt_pid; -+ info->si_overrun = si->cpt_uid; -+ info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval); -+ info->si_sys_private = 
si->cpt_utime; -+ break; -+ case __SI_POLL: -+ info->si_band = si->cpt_pid; -+ info->si_fd = si->cpt_uid; -+ break; -+ case __SI_FAULT: -+ info->si_addr = cpt_ptr_import(si->cpt_sigval); -+#ifdef __ARCH_SI_TRAPNO -+ info->si_trapno = si->cpt_pid; -+#endif -+ break; -+ case __SI_CHLD: -+ info->si_pid = si->cpt_pid; -+ info->si_uid = si->cpt_uid; -+ info->si_status = si->cpt_sigval; -+ info->si_stime = si->cpt_stime; -+ info->si_utime = si->cpt_utime; -+ break; -+ case __SI_KILL: -+ case __SI_RT: -+ case __SI_MESGQ: -+ default: -+ info->si_pid = si->cpt_pid; -+ info->si_uid = si->cpt_uid; -+ info->si_ptr = cpt_ptr_import(si->cpt_sigval); -+ break; -+ } -+ info->si_signo = si->cpt_signo; -+ info->si_errno = si->cpt_errno; -+ info->si_code = si->cpt_code; -+} -+ -+static int restore_sigqueue(task_t *tsk, -+ struct sigpending *queue, unsigned long start, -+ unsigned long end) -+{ -+ while (start < end) { -+ struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start; -+ if (si->cpt_object == CPT_OBJ_SIGINFO) { -+ struct user_beancounter *ub; -+ struct sigqueue *q = NULL; -+ struct user_struct *up; -+ up = alloc_uid(si->cpt_user); -+ if (!up) -+ return -ENOMEM; -+ q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); -+ if (!q) { -+ free_uid(up); -+ return -ENOMEM; -+ } -+ ub = get_beancounter(get_exec_ub()); -+ if (ub_siginfo_charge(q, ub)) { -+ put_beancounter(ub); -+ kmem_cache_free(sigqueue_cachep, q); -+ free_uid(up); -+ return -ENOMEM; -+ } -+ -+ INIT_LIST_HEAD(&q->list); -+ /* Preallocated elements (posix timers) are not -+ * supported yet. It is safe to replace them with -+ * a private one. 
*/ -+ q->flags = 0; -+ q->user = up; -+ atomic_inc(&q->user->sigpending); -+ q->sig_ub = ub; -+ -+ decode_siginfo(&q->info, si); -+ list_add_tail(&q->list, &queue->list); -+ } -+ start += si->cpt_next; -+ } -+ return 0; -+} -+ -+int rst_process_linkage(cpt_context_t *ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ struct cpt_task_image *ti = obj->o_image; -+ -+ if (tsk == NULL) { -+ eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm); -+ return -EINVAL; -+ } -+ -+ if (virt_pgid(tsk) != ti->cpt_pgrp) { -+ int pid; -+ -+ if ((pid = vpid_to_pid(ti->cpt_pgrp)) < 0) { -+ eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk)); -+ return -EINVAL; -+ } -+ -+ write_lock_irq(&tasklist_lock); -+ detach_pid(tsk, PIDTYPE_PGID); -+ tsk->signal->pgrp = pid; -+ set_virt_pgid(tsk, ti->cpt_pgrp); -+ if (thread_group_leader(tsk)) -+ attach_pid(tsk, PIDTYPE_PGID, pid); -+ write_unlock_irq(&tasklist_lock); -+ } -+ if (virt_sid(tsk) != ti->cpt_session) { -+ int pid; -+ -+ if ((pid = vpid_to_pid(ti->cpt_session)) < 0) { -+ eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk)); -+ return -EINVAL; -+ } -+ -+ write_lock_irq(&tasklist_lock); -+ detach_pid(tsk, PIDTYPE_SID); -+ tsk->signal->session = pid; -+ set_virt_sid(tsk, ti->cpt_session); -+ if (thread_group_leader(tsk)) -+ attach_pid(tsk, PIDTYPE_SID, pid); -+ write_unlock_irq(&tasklist_lock); -+ } -+ if (ti->cpt_old_pgrp > 0 && tsk->signal->tty_old_pgrp == 0) { -+ int pid; -+ -+ if ((pid = vpid_to_pid(ti->cpt_old_pgrp)) < 0) { -+ eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk)); -+ return -EINVAL; -+ } -+ -+ tsk->signal->tty_old_pgrp = pid; -+ } -+ } -+ -+ return 0; -+} -+ -+static int restore_one_signal_struct(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_signal_image *si = cpt_get_buf(ctx); -+ -+ current->signal->tty = NULL; -+ -+ err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx); -+ if (err) { 
-+ cpt_release_buf(ctx); -+ return err; -+ } -+ -+ if (virt_pgid(current) != si->cpt_pgrp) { -+ int err; -+ int pid = 0; -+ -+ if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) { -+ pid = alloc_pidmap(); -+ if (pid < 0) { -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ if ((err = alloc_vpid(pid, si->cpt_pgrp)) < 0) { -+ free_pidmap(pid); -+ pid = 0; -+ if (err != -EEXIST) { -+ cpt_release_buf(ctx); -+ return err; -+ } -+ } -+ } -+ if (pid || -+ (pid = vpid_to_pid(si->cpt_pgrp)) > 0) { -+ write_lock_irq(&tasklist_lock); -+ detach_pid(current, PIDTYPE_PGID); -+ current->signal->pgrp = pid; -+ set_virt_pgid(current, si->cpt_pgrp); -+ if (thread_group_leader(current)) -+ attach_pid(current, PIDTYPE_PGID, pid); -+ write_unlock_irq(&tasklist_lock); -+ } -+ } -+ -+ current->signal->tty_old_pgrp = 0; -+ if ((int)si->cpt_old_pgrp > 0) { -+ if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) { -+ current->signal->tty_old_pgrp = alloc_pidmap(); -+ if (current->signal->tty_old_pgrp < 0) { -+ eprintk_ctx("failed to allocate stray tty_old_pgrp\n"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ free_pidmap(current->signal->tty_old_pgrp); -+ } else { -+ current->signal->tty_old_pgrp = vpid_to_pid(si->cpt_old_pgrp); -+ if (current->signal->tty_old_pgrp < 0) { -+ dprintk_ctx("forward old tty PGID\n"); -+ current->signal->tty_old_pgrp = 0; -+ } -+ } -+ } -+ -+ if (virt_sid(current) != si->cpt_session) { -+ int err; -+ int pid = 0; -+ -+ if (si->cpt_session_type == CPT_PGRP_ORPHAN) { -+ pid = alloc_pidmap(); -+ if (pid < 0) { -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ if ((err = alloc_vpid(pid, si->cpt_session)) < 0) { -+ free_pidmap(pid); -+ pid = 0; -+ if (err != -EEXIST) { -+ cpt_release_buf(ctx); -+ return err; -+ } -+ } -+ } -+ if (pid || -+ (pid = vpid_to_pid(si->cpt_session)) > 0) { -+ write_lock_irq(&tasklist_lock); -+ detach_pid(current, PIDTYPE_SID); -+ set_virt_sid(current, si->cpt_session); -+ current->signal->session = pid; -+ if (thread_group_leader(current)) -+ 
attach_pid(current, PIDTYPE_SID, pid); -+ write_unlock_irq(&tasklist_lock); -+ } -+ } -+ -+ cpt_sigset_import(¤t->signal->shared_pending.signal, si->cpt_sigpending); -+ current->signal->leader = si->cpt_leader; -+ if (si->cpt_ctty != CPT_NULL) { -+ cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx); -+ if (obj) { -+ struct tty_struct *tty = obj->o_obj; -+ if (tty->session == 0 || tty->session == current->signal->session) { -+ tty->session = current->signal->session; -+ current->signal->tty = tty; -+ } else { -+ wprintk_ctx("tty session mismatch\n"); -+ } -+ } -+ } -+ -+ if (si->cpt_curr_target) -+ current->signal->curr_target = find_task_by_pid_ve(si->cpt_curr_target); -+ current->signal->flags = 0; -+ if (si->cpt_group_exit) -+ current->signal->flags |= SIGNAL_GROUP_EXIT; -+ current->signal->group_exit_code = si->cpt_group_exit_code; -+ if (si->cpt_group_exit_task) { -+ current->signal->group_exit_task = find_task_by_pid_ve(si->cpt_group_exit_task); -+ if (current->signal->group_exit_task == NULL) { -+ eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ } -+ current->signal->notify_count = si->cpt_notify_count; -+ current->signal->group_stop_count = si->cpt_group_stop_count; -+ -+ if (si->cpt_next > si->cpt_hdrlen) { -+ char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL); -+ if (buf == NULL) { -+ cpt_release_buf(ctx); -+ return -ENOMEM; -+ } -+ err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx, -+ ti->cpt_signal + si->cpt_hdrlen); -+ if (err) { -+ kfree(buf); -+ cpt_release_buf(ctx); -+ return err; -+ } -+ restore_sigqueue(current, -+ ¤t->signal->shared_pending, (unsigned long)buf, -+ (unsigned long)buf + si->cpt_next - si->cpt_hdrlen); -+ kfree(buf); -+ } -+ cpt_release_buf(ctx); -+ return 0; -+} -+ -+int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_sighand_image si; -+ int i; -+ 
loff_t pos, endpos; -+ -+ err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx); -+ if (err) -+ return err; -+ -+ for (i=0; i<_NSIG; i++) { -+ current->sighand->action[i].sa.sa_handler = SIG_DFL; -+ current->sighand->action[i].sa.sa_restorer = 0; -+ current->sighand->action[i].sa.sa_flags = SA_ONESHOT | SA_NOMASK; -+ memset(¤t->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t)); -+ } -+ -+ pos = ti->cpt_sighand + si.cpt_hdrlen; -+ endpos = ti->cpt_sighand + si.cpt_next; -+ while (pos < endpos) { -+ struct cpt_sighandler_image shi; -+ -+ err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx); -+ if (err) -+ return err; -+ current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler; -+ current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer; -+ current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags; -+ cpt_sigset_import(¤t->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask); -+ pos += shi.cpt_next; -+ } -+ -+ return 0; -+} -+ -+ -+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ __u32 flag = 0; -+ -+ if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx)) -+ flag |= CLONE_THREAD; -+ if (ti->cpt_sighand == CPT_NULL || -+ lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx)) -+ flag |= CLONE_SIGHAND; -+ return flag; -+} -+ -+int rst_signal_complete(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ int err; -+ cpt_object_t *obj; -+ -+ if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) { -+ return -EINVAL; -+ } -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx); -+ if (obj) { -+ struct sighand_struct *sig = current->sighand; -+ if (obj->o_obj != sig) { -+ return -EINVAL; -+ } -+ } else { -+ obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx); -+ if (obj == NULL) -+ return -ENOMEM; -+ cpt_obj_setpos(obj, ti->cpt_sighand, ctx); -+ err = 
restore_one_sighand_struct(ti, ctx); -+ if (err) -+ return err; -+ } -+ -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx); -+ if (obj) { -+ struct signal_struct *sig = current->signal; -+ if (obj->o_obj != sig) { -+ return -EINVAL; -+ } -+ if (current->signal) { -+ set_virt_pgid(current, pid_type_to_vpid(PIDTYPE_PGID, current->signal->pgrp)); -+ set_virt_sid(current, pid_type_to_vpid(PIDTYPE_SID, current->signal->session)); -+ } -+ } else { -+ obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx); -+ if (obj == NULL) -+ return -ENOMEM; -+ cpt_obj_setpos(obj, ti->cpt_signal, ctx); -+ err = restore_one_signal_struct(ti, ctx); -+ if (err) -+ return err; -+ } -+ -+ return 0; -+} -+ -+static u32 decode_segment(u32 segid) -+{ -+ if (segid == CPT_SEG_ZERO) -+ return 0; -+ -+ /* TLS descriptors */ -+ if (segid <= CPT_SEG_TLS3) -+ return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3; -+ -+ /* LDT descriptor, it is just an index to LDT array */ -+ if (segid >= CPT_SEG_LDT) -+ return ((segid - CPT_SEG_LDT) << 3) | 7; -+ -+ /* Check for one of standard descriptors */ -+#ifdef CONFIG_X86_64 -+ if (segid == CPT_SEG_USER32_DS) -+ return __USER32_DS; -+ if (segid == CPT_SEG_USER32_CS) -+ return __USER32_CS; -+ if (segid == CPT_SEG_USER64_DS) -+ return __USER_DS; -+ if (segid == CPT_SEG_USER64_CS) -+ return __USER_CS; -+#else -+ if (segid == CPT_SEG_USER32_DS) -+ return __USER_DS; -+ if (segid == CPT_SEG_USER32_CS) -+ return __USER_CS; -+#endif -+ wprintk("Invalid segment reg %d\n", segid); -+ return 0; -+} -+ -+unsigned long rct(unsigned long *child_tids) -+{ -+ dprintk("rct: " CPT_FID "\n", CPT_TID(current)); -+ current->clear_child_tid = (void*)child_tids[0]; -+ current->set_child_tid = (void*)child_tids[1]; -+ module_put(THIS_MODULE); -+ return (unsigned long)(child_tids+2); -+} -+ -+unsigned long rlsi(void) -+{ -+ int signr; -+ siginfo_t *info = current->last_siginfo; -+ struct pt_regs *regs = task_pt_regs(current); -+ struct 
k_sigaction *ka; -+ int ptrace_id; -+ -+ dprintk("rlsi: " CPT_FID "\n", CPT_TID(current)); -+ -+ spin_lock_irq(¤t->sighand->siglock); -+ current->last_siginfo = NULL; -+ recalc_sigpending(); -+ -+ ptrace_id = current->pn_state; -+ clear_pn_state(current); -+ -+ switch (ptrace_id) { -+ case PN_STOP_TF: -+ case PN_STOP_TF_RT: -+ /* frame_*signal */ -+ dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %lu %lu\n", -+ virt_pid(current), current->pid, current->comm, -+ info->si_signo, info->si_code, -+ current->exit_code, SYSCALL_NR(regs), -+ current->ptrace, current->ptrace_message); -+ goto out; -+ case PN_STOP_ENTRY: -+ case PN_STOP_LEAVE: -+ /* do_syscall_trace */ -+ spin_unlock_irq(¤t->sighand->siglock); -+ dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code); -+ if (current->exit_code) { -+ send_sig(current->exit_code, current, 1); -+ current->exit_code = 0; -+ } -+ if (ptrace_id == PN_STOP_ENTRY && SYSCALL_RETVAL(regs) == -ENOSYS) { -+ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); -+ SYSCALL_PC(regs) -= 2; -+ } else if (syscall_is(current, regs, rt_sigtimedwait)) { -+ if (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR) { -+ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); -+ SYSCALL_PC(regs) -= 2; -+ } -+ } -+ goto out_nolock; -+ case PN_STOP_FORK: -+ /* fork */ -+ SYSCALL_RETVAL(regs) = current->ptrace_message; -+ dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs)); -+ goto out; -+ case PN_STOP_VFORK: -+ /* after vfork */ -+ SYSCALL_RETVAL(regs) = current->ptrace_message; -+ dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs)); -+ goto out; -+ case PN_STOP_SIGNAL: -+ /* normal case : dequeue signal */ -+ break; -+ case PN_STOP_EXIT: -+ dprintk("ptrace exit caught\n"); -+ current->ptrace &= ~PT_TRACE_EXIT; -+ spin_unlock_irq(¤t->sighand->siglock); -+ module_put(THIS_MODULE); -+ complete_and_exit(NULL, current->ptrace_message); -+ BUG(); -+ case PN_STOP_EXEC: -+ eprintk("ptrace after exec caught: must not happen\n"); -+ 
BUG(); -+ default: -+ eprintk("ptrace with unknown identity %d\n", ptrace_id); -+ BUG(); -+ } -+ -+ signr = current->exit_code; -+ if (signr == 0) { -+ dprintk("rlsi: canceled signal %d\n", info->si_signo); -+ goto out; -+ } -+ current->exit_code = 0; -+ -+ if (signr != info->si_signo) { -+ info->si_signo = signr; -+ info->si_errno = 0; -+ info->si_code = SI_USER; -+ info->si_pid = virt_pid(current->parent); -+ info->si_uid = current->parent->uid; -+ } -+ -+ /* If the (new) signal is now blocked, requeue it. */ -+ if (sigismember(¤t->blocked, signr)) { -+ dprintk("going to requeue signal %d\n", signr); -+ goto out_resend_sig; -+ } -+ -+ ka = ¤t->sighand->action[signr-1]; -+ if (ka->sa.sa_handler == SIG_IGN) { -+ dprintk("going to resend signal %d (ignored)\n", signr); -+ goto out; -+ } -+ if (ka->sa.sa_handler != SIG_DFL) { -+ dprintk("going to resend signal %d (not SIG_DFL)\n", signr); -+ goto out_resend_sig; -+ } -+ if (signr == SIGCONT || -+ signr == SIGCHLD || -+ signr == SIGWINCH || -+ signr == SIGURG || -+ current->pid == 1) -+ goto out; -+ -+ /* All the rest, which we cannot handle are requeued. */ -+ dprintk("going to resend signal %d (sigh)\n", signr); -+out_resend_sig: -+ spin_unlock_irq(¤t->sighand->siglock); -+ send_sig_info(signr, info, current); -+ module_put(THIS_MODULE); -+ return (unsigned long)(info+1); -+ -+out: -+ spin_unlock_irq(¤t->sighand->siglock); -+out_nolock: -+ module_put(THIS_MODULE); -+ return (unsigned long)(info+1); -+} -+ -+static void ret_finish_stop(void) -+{ -+ /* ... -+ * do_signal() -> -+ * get_signal_to_deliver() -> -+ * do_signal_stop() -> -+ * finish_stop() -+ * -+ * Normally after SIGCONT it will dequeue the next signal. If no signal -+ * is found, do_signal restarts syscall unconditionally. -+ * Otherwise signal handler is pushed on user stack. 
-+ */ -+ -+ dprintk("rfs: " CPT_FID "\n", CPT_TID(current)); -+ -+ clear_stop_state(current); -+ current->exit_code = 0; -+ -+ module_put(THIS_MODULE); -+} -+ -+static void ret_restart_sys(void) -+{ -+ struct pt_regs *regs = task_pt_regs(current); -+ -+ /* This hook is supposed to be executed, when we have -+ * to complete some interrupted syscall. -+ */ -+ dprintk("rrs: " CPT_FID "\n", CPT_TID(current)); -+ -+ if (syscall_is(current,regs,pause)) { -+ if (SYSCALL_RETVAL(regs) == -ERESTARTNOHAND) { -+ current->state = TASK_INTERRUPTIBLE; -+ schedule(); -+ } -+ } else if (syscall_is(current,regs,rt_sigtimedwait)) { -+ if (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR) { -+ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); -+ SYSCALL_PC(regs) -= 2; -+ } -+ } else if (syscall_is(current,regs,futex)) { -+ if (SYSCALL_RETVAL(regs) == -EINTR) { -+ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); -+ SYSCALL_PC(regs) -= 2; -+ } -+ } -+ -+ if (!signal_pending(current)) { -+ if (SYSCALL_RETVAL(regs) == -ERESTARTSYS || -+ SYSCALL_RETVAL(regs) == -ERESTARTNOINTR || -+ SYSCALL_RETVAL(regs) == -ERESTARTNOHAND) { -+ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); -+ SYSCALL_PC(regs) -= 2; -+ } else if (SYSCALL_RETVAL(regs) == -ERESTART_RESTARTBLOCK) { -+ SYSCALL_RETVAL(regs) = __NR_restart_syscall; -+#ifdef CONFIG_X86_64 -+ if (current->thread_info->flags&_TIF_IA32) -+ SYSCALL_RETVAL(regs) = __NR32_restart_syscall; -+#endif -+ SYSCALL_PC(regs) -= 2; -+ } -+ } -+ -+ module_put(THIS_MODULE); -+} -+ -+extern void ret_last_siginfo(void); -+extern void ret_child_tid(void); -+extern void ret_from_rst(void); -+extern void pre_ret_from_fork(void); -+ -+#ifndef CONFIG_X86_64 -+ -+/* tsk->thread.eip points to pre_ret_from_fork -+ * Stack layout: -+ * [eip of the last hook] -+ * [args of the last hook] -+ * [eip of previous hook] -+ * [args of previous hook] -+ * ... 
-+ * [eip of the first hook] -+ * [args of the first hook] -+ * [ret_from_rst] -+ */ -+ -+static void * add_hook(task_t *tsk, void (*hook)(void), int argsize, int *hooks) -+{ -+ ESP(tsk) -= sizeof(unsigned long); -+ *(unsigned long*)ESP(tsk) = tsk->thread.eip; -+ ESP(tsk) -= argsize; -+ tsk->thread.eip = (unsigned long)hook; -+ if (!try_module_get(THIS_MODULE)) BUG(); -+ (*hooks)++; -+ return (void*)ESP(tsk); -+} -+ -+static int restore_registers(task_t *tsk, struct pt_regs *regs, -+ struct cpt_task_image *ti, struct cpt_x86_regs *b) -+{ -+ if (b->cpt_object != CPT_OBJ_X86_REGS) -+ return -EINVAL; -+ -+ tsk->thread.esp = (unsigned long) regs; -+ tsk->thread.esp0 = (unsigned long) (regs+1); -+ tsk->thread.eip = (unsigned long) ret_from_rst; -+ -+ tsk->thread.fs = decode_segment(b->cpt_fs); -+ tsk->thread.gs = decode_segment(b->cpt_gs); -+ tsk->thread.debugreg[0] = b->cpt_debugreg[0]; -+ tsk->thread.debugreg[1] = b->cpt_debugreg[1]; -+ tsk->thread.debugreg[2] = b->cpt_debugreg[2]; -+ tsk->thread.debugreg[3] = b->cpt_debugreg[3]; -+ tsk->thread.debugreg[4] = b->cpt_debugreg[4]; -+ tsk->thread.debugreg[5] = b->cpt_debugreg[5]; -+ tsk->thread.debugreg[6] = b->cpt_debugreg[6]; -+ tsk->thread.debugreg[7] = b->cpt_debugreg[7]; -+ -+ memcpy(regs, &b->cpt_ebx, sizeof(struct pt_regs)); -+ -+ regs->xcs = decode_segment(b->cpt_xcs); -+ regs->xss = decode_segment(b->cpt_xss); -+ regs->xds = decode_segment(b->cpt_xds); -+ regs->xes = decode_segment(b->cpt_xes); -+ -+ return 0; -+} -+ -+#else -+ -+/* Stack layout: -+ * -+ * [eip of the last hook] -+ * [args of the last hook] -+ * ... 
-+ * [eip of the first hook] -+ * [args of the first hook] -+ * [ret_from_fork+5] -+ */ -+ -+static void * add_hook(task_t *tsk, void (*hook)(void), int argsize, int *hooks) -+{ -+ if (!*hooks) { -+ extern void ret_from_fork2(void); -+ ESP(tsk) -= sizeof(unsigned long); -+ *(unsigned long*)ESP(tsk) = (unsigned long)ret_from_fork2; -+ tsk->thread_info->flags |= _TIF_RESUME; -+ } -+ ESP(tsk) -= argsize + sizeof(unsigned long); -+ *(unsigned long*)ESP(tsk) = (unsigned long)hook; -+ if (!try_module_get(THIS_MODULE)) BUG(); -+ (*hooks)++; -+ return (void*)(ESP(tsk) + sizeof(unsigned long)); -+} -+ -+static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s) -+{ -+ memset(d, 0, sizeof(struct pt_regs)); -+ d->rbp = s->cpt_ebp; -+ d->rbx = s->cpt_ebx; -+ d->rax = (s32)s->cpt_eax; -+ d->rcx = s->cpt_ecx; -+ d->rdx = s->cpt_edx; -+ d->rsi = s->cpt_esi; -+ d->rdi = s->cpt_edi; -+ d->orig_rax = (s32)s->cpt_orig_eax; -+ d->rip = s->cpt_eip; -+ d->cs = s->cpt_xcs; -+ d->eflags = s->cpt_eflags; -+ d->rsp = s->cpt_esp; -+ d->ss = s->cpt_xss; -+} -+ -+static int restore_registers(task_t *tsk, struct pt_regs *regs, -+ struct cpt_task_image *ti, struct cpt_obj_bits *hdr) -+{ -+ if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) { -+ struct cpt_x86_64_regs *b = (void*)hdr; -+ -+ tsk->thread.rsp = (unsigned long) regs; -+ tsk->thread.rsp0 = (unsigned long) (regs+1); -+ -+ tsk->thread.fs = b->cpt_fsbase; -+ tsk->thread.gs = b->cpt_gsbase; -+ tsk->thread.fsindex = decode_segment(b->cpt_fsindex); -+ tsk->thread.gsindex = decode_segment(b->cpt_gsindex); -+ tsk->thread.ds = decode_segment(b->cpt_ds); -+ tsk->thread.es = decode_segment(b->cpt_es); -+ tsk->thread.debugreg0 = b->cpt_debugreg[0]; -+ tsk->thread.debugreg1 = b->cpt_debugreg[1]; -+ tsk->thread.debugreg2 = b->cpt_debugreg[2]; -+ tsk->thread.debugreg3 = b->cpt_debugreg[3]; -+ tsk->thread.debugreg6 = b->cpt_debugreg[6]; -+ tsk->thread.debugreg7 = b->cpt_debugreg[7]; -+ -+ memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs)); 
-+ -+ tsk->thread.userrsp = regs->rsp; -+ regs->cs = decode_segment(b->cpt_cs); -+ regs->ss = decode_segment(b->cpt_ss); -+ } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) { -+ struct cpt_x86_regs *b = (void*)hdr; -+ -+ tsk->thread.rsp = (unsigned long) regs; -+ tsk->thread.rsp0 = (unsigned long) (regs+1); -+ -+ tsk->thread.fs = 0; -+ tsk->thread.gs = 0; -+ tsk->thread.fsindex = decode_segment(b->cpt_fs); -+ tsk->thread.gsindex = decode_segment(b->cpt_gs); -+ tsk->thread.debugreg0 = b->cpt_debugreg[0]; -+ tsk->thread.debugreg1 = b->cpt_debugreg[1]; -+ tsk->thread.debugreg2 = b->cpt_debugreg[2]; -+ tsk->thread.debugreg3 = b->cpt_debugreg[3]; -+ tsk->thread.debugreg6 = b->cpt_debugreg[6]; -+ tsk->thread.debugreg7 = b->cpt_debugreg[7]; -+ -+ xlate_ptregs_32_to_64(regs, b); -+ -+ tsk->thread.userrsp = regs->rsp; -+ regs->cs = decode_segment(b->cpt_xcs); -+ regs->ss = decode_segment(b->cpt_xss); -+ tsk->thread.ds = decode_segment(b->cpt_xds); -+ tsk->thread.es = decode_segment(b->cpt_xes); -+ } else { -+ return -EINVAL; -+ } -+ return 0; -+} -+ -+#endif -+ -+int rst_restore_process(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ struct cpt_task_image *ti = obj->o_image; -+ struct pt_regs * regs; -+ struct cpt_object_hdr *b; -+ struct cpt_siginfo_image *lsi = NULL; -+ struct group_info *gids, *ogids; -+ int hooks = 0; -+ int i; -+ -+ if (tsk == NULL) { -+ eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm); -+ return -EFAULT; -+ } -+ -+ wait_task_inactive(tsk); -+ regs = task_pt_regs(tsk); -+ -+ if (!tsk->exit_state) { -+ tsk->lock_depth = -1; -+#ifdef CONFIG_PREEMPT -+ tsk->thread_info->preempt_count--; -+#endif -+ } -+ -+ if (tsk->static_prio != ti->cpt_static_prio) -+ set_user_nice(tsk, PRIO_TO_NICE(ti->cpt_static_prio)); -+ -+ cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked); -+ cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked); -+ 
cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked); -+ cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending); -+ -+ tsk->uid = ti->cpt_uid; -+ tsk->euid = ti->cpt_euid; -+ tsk->suid = ti->cpt_suid; -+ tsk->fsuid = ti->cpt_fsuid; -+ tsk->gid = ti->cpt_gid; -+ tsk->egid = ti->cpt_egid; -+ tsk->sgid = ti->cpt_sgid; -+ tsk->fsgid = ti->cpt_fsgid; -+ memcpy(&tsk->cap_effective, &ti->cpt_ecap, sizeof(tsk->cap_effective)); -+ memcpy(&tsk->cap_inheritable, &ti->cpt_icap, sizeof(tsk->cap_inheritable)); -+ memcpy(&tsk->cap_permitted, &ti->cpt_pcap, sizeof(tsk->cap_permitted)); -+ tsk->keep_capabilities = (ti->cpt_keepcap != 0); -+ tsk->did_exec = (ti->cpt_did_exec != 0); -+ gids = groups_alloc(ti->cpt_ngids); -+ ogids = tsk->group_info; -+ if (gids) { -+ int i; -+ for (i=0; i<32; i++) -+ gids->small_block[i] = ti->cpt_gids[i]; -+ tsk->group_info = gids; -+ } -+ if (ogids) -+ put_group_info(ogids); -+ tsk->utime = ti->cpt_utime; -+ tsk->stime = ti->cpt_stime; -+ if (ctx->image_version == 0) { -+ tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC); -+ } else { -+ cpt_timespec_import(&tsk->start_time, ti->cpt_starttime); -+ } -+ _set_normalized_timespec(&tsk->start_time, -+ tsk->start_time.tv_sec - -+ get_exec_env()->init_entry->start_time.tv_sec, -+ tsk->start_time.tv_nsec - -+ get_exec_env()->init_entry->start_time.tv_nsec); -+ -+ tsk->nvcsw = ti->cpt_nvcsw; -+ tsk->nivcsw = ti->cpt_nivcsw; -+ tsk->min_flt = ti->cpt_min_flt; -+ tsk->maj_flt = ti->cpt_maj_flt; -+ -+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) -+ tsk->cutime = ti->cpt_cutime; -+ tsk->cstime = ti->cpt_cstime; -+ tsk->cnvcsw = ti->cpt_cnvcsw; -+ tsk->cnivcsw = ti->cpt_cnivcsw; -+ tsk->cmin_flt = ti->cpt_cmin_flt; -+ tsk->cmaj_flt = ti->cpt_cmaj_flt; -+ -+ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) -+ __asm__("undefined\n"); -+ -+ for (i=0; i<RLIM_NLIMITS; i++) { -+ tsk->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; -+ tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i]; -+ } -+#else -+ if 
(thread_group_leader(tsk) && tsk->signal) { -+ tsk->signal->utime = ti->cpt_utime; -+ tsk->signal->stime = ti->cpt_stime; -+ tsk->signal->cutime = ti->cpt_cutime; -+ tsk->signal->cstime = ti->cpt_cstime; -+ tsk->signal->nvcsw = ti->cpt_nvcsw; -+ tsk->signal->nivcsw = ti->cpt_nivcsw; -+ tsk->signal->cnvcsw = ti->cpt_cnvcsw; -+ tsk->signal->cnivcsw = ti->cpt_cnivcsw; -+ tsk->signal->min_flt = ti->cpt_min_flt; -+ tsk->signal->maj_flt = ti->cpt_maj_flt; -+ tsk->signal->cmin_flt = ti->cpt_cmin_flt; -+ tsk->signal->cmaj_flt = ti->cpt_cmaj_flt; -+ -+ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) -+ __asm__("undefined\n"); -+ -+ for (i=0; i<RLIM_NLIMITS; i++) { -+ tsk->signal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; -+ tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i]; -+ } -+ } -+#endif -+ -+ for (i=0; i<3; i++) { -+ if (i >= GDT_ENTRY_TLS_ENTRIES) { -+ eprintk_ctx("too many tls descs\n"); -+ } else { -+#ifndef CONFIG_X86_64 -+ tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF; -+ tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32; -+#else -+ tsk->thread.tls_array[i] = ti->cpt_tls[i]; -+#endif -+ } -+ } -+ -+ clear_stopped_child_used_math(tsk); -+ -+ b = (void *)(ti+1); -+ while ((void*)b < ((void*)ti) + ti->cpt_next) { -+ /* Siginfo objects are at the end of obj array */ -+ if (b->cpt_object == CPT_OBJ_SIGINFO) { -+ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); -+ restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next); -+ set_exec_env(env); -+ break; -+ } -+ -+ switch (b->cpt_object) { -+ case CPT_OBJ_BITS: -+ if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE && -+ cpu_has_fxsr) { -+ memcpy(&tsk->thread.i387, -+ (void*)b + b->cpt_hdrlen, -+ sizeof(struct i387_fxsave_struct)); -+ if (ti->cpt_used_math) -+ set_stopped_child_used_math(tsk); -+ } -+#ifdef CONFIG_X86_32 -+ else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD && -+ !cpu_has_fxsr) { -+ memcpy(&tsk->thread.i387, -+ (void*)b + b->cpt_hdrlen, -+ sizeof(struct 
i387_fsave_struct)); -+ if (ti->cpt_used_math) -+ set_stopped_child_used_math(tsk); -+ } -+#endif -+ break; -+ case CPT_OBJ_LASTSIGINFO: -+ lsi = (void*)b; -+ break; -+ case CPT_OBJ_X86_REGS: -+ case CPT_OBJ_X86_64_REGS: -+ if (restore_registers(tsk, regs, ti, (void*)b)) { -+ eprintk_ctx("cannot restore registers: image is corrupted\n"); -+ return -EINVAL; -+ } -+ break; -+ } -+ b = ((void*)b) + b->cpt_next; -+ } -+ -+ if (ti->cpt_ppid != ti->cpt_rppid) { -+ task_t *parent; -+ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); -+ write_lock_irq(&tasklist_lock); -+ parent = find_task_by_pid_ve(ti->cpt_ppid); -+ if (parent && parent != tsk->parent) { -+ list_add(&tsk->ptrace_list, &tsk->parent->ptrace_children); -+ REMOVE_LINKS(tsk); -+ tsk->parent = parent; -+ SET_LINKS(tsk); -+ } -+ write_unlock_irq(&tasklist_lock); -+ set_exec_env(env); -+ } -+ -+ tsk->ptrace_message = ti->cpt_ptrace_message; -+ tsk->pn_state = ti->cpt_pn_state; -+ tsk->stopped_state = ti->cpt_stopped_state; -+ tsk->thread_info->flags = ti->cpt_thrflags; -+ -+ /* The image was created with kernel < 2.6.16, while -+ * task hanged in sigsuspend -> do_signal. -+ * -+ * FIXME! This needs more brain efforts... 
-+ */ -+ if (ti->cpt_sigsuspend_state) { -+ tsk->thread_info->flags |= _TIF_RESTORE_SIGMASK; -+ } -+ -+#ifdef CONFIG_X86_64 -+ tsk->thread_info->flags |= _TIF_FORK; -+ if (!ti->cpt_64bit) -+ tsk->thread_info->flags |= _TIF_IA32; -+#endif -+ -+#ifndef CONFIG_X86_64 -+ do { -+ if (regs->orig_eax == __NR__newselect && regs->edi) { -+ struct timeval tv; -+ if (access_process_vm(tsk, regs->edi, &tv, -+ sizeof(tv), 0) != sizeof(tv)) { -+ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n", -+ virt_pid(tsk), tsk->pid, tsk->comm, -+ regs->edi); -+ break; -+ } -+ dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n", -+ virt_pid(tsk), tsk->pid, tsk->comm, -+ tv.tv_sec, tv.tv_usec); -+ tv.tv_sec -= ctx->delta_time.tv_sec; -+ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { -+ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; -+ tv.tv_sec--; -+ } else { -+ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; -+ } -+ if (tv.tv_sec < 0) { -+ tv.tv_sec = 0; -+ tv.tv_usec = 0; -+ } -+ dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n", -+ virt_pid(tsk), tsk->pid, tsk->comm, -+ tv.tv_sec, tv.tv_usec); -+ if (access_process_vm(tsk, regs->edi, &tv, -+ sizeof(tv), 1) != sizeof(tv)) { -+ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n", -+ virt_pid(tsk), tsk->pid, tsk->comm, regs->edi); -+ } -+ -+ } else if (regs->orig_eax == __NR_select && regs->edi) { -+ struct { -+ unsigned long n; -+ fd_set __user *inp, *outp, *exp; -+ struct timeval __user *tvp; -+ } a; -+ struct timeval tv; -+ if (access_process_vm(tsk, regs->ebx, &a, -+ sizeof(a), 0) != sizeof(a)) { -+ wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid); -+ break; -+ } -+ if (access_process_vm(tsk, (unsigned long)a.tvp, -+ &tv, sizeof(tv), 0) != sizeof(tv)) { -+ wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid); -+ break; -+ } -+ dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n", -+ tsk->pid, tv.tv_sec, tv.tv_usec); -+ 
tv.tv_sec -= ctx->delta_time.tv_sec; -+ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { -+ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; -+ tv.tv_sec--; -+ } else { -+ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; -+ } -+ if (tv.tv_sec < 0) { -+ tv.tv_sec = 0; -+ tv.tv_usec = 0; -+ } -+ dprintk_ctx("task %d: New timeval in select: %ld.%ld\n", -+ tsk->pid, tv.tv_sec, tv.tv_usec); -+ if (access_process_vm(tsk, (unsigned long)a.tvp, -+ &tv, sizeof(tv), 1) != sizeof(tv)) { -+ wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid); -+ } -+ } -+ } while (0); -+#endif -+ -+ if (!tsk->exit_state && (long)SYSCALL_NR(regs) >= 0) { -+ if (SYSCALL_RETVAL(regs) == -ERESTARTSYS || -+ SYSCALL_RETVAL(regs) == -ERESTARTNOINTR || -+ SYSCALL_RETVAL(regs) == -ERESTARTNOHAND || -+ SYSCALL_RETVAL(regs) == -ERESTART_RESTARTBLOCK || -+ syscall_is(tsk,regs,pause) || -+ (syscall_is(tsk,regs,rt_sigtimedwait) && -+ (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR)) || -+ (syscall_is(tsk,regs,futex) && -+ (SYSCALL_RETVAL(regs) == -EINTR))) -+ add_hook(tsk, ret_restart_sys, 0, &hooks); -+ } -+ -+ if (lsi || tsk->pn_state) { -+ /* ... -> ptrace_notify() -+ * or -+ * ... 
-> do_signal() -> get_signal_to_deliver() -> -+ * ptrace stop -+ */ -+ tsk->last_siginfo = add_hook(tsk, ret_last_siginfo, sizeof(siginfo_t), &hooks); -+ memset(tsk->last_siginfo, 0, sizeof(siginfo_t)); -+ if (lsi) -+ decode_siginfo(tsk->last_siginfo, lsi); -+ } -+ -+ tsk->ptrace = ti->cpt_ptrace; -+ tsk->flags = ti->cpt_flags & ~PF_FROZEN; -+ clear_tsk_thread_flag(tsk, TIF_FREEZE); -+ tsk->exit_signal = ti->cpt_exit_signal; -+ -+ if (tsk->stopped_state) { -+ dprintk_ctx("finish_stop\n"); -+ if (ti->cpt_state != TASK_STOPPED) -+ eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state); -+ add_hook(tsk, ret_finish_stop, 0, &hooks); -+ } -+ -+ if (!tsk->exit_state && -+ (ti->cpt_set_tid || ti->cpt_clear_tid)) { -+ unsigned long *ptr = add_hook(tsk, ret_child_tid, sizeof(unsigned long)*2, &hooks); -+ ptr[0] = ti->cpt_clear_tid; -+ ptr[1] = ti->cpt_set_tid; -+ dprintk_ctx("settids\n"); -+ } -+ -+#ifdef CONFIG_X86_64 -+ if (!hooks && (long)SYSCALL_NR(regs) < 0) { -+ extern void ret_from_fork2(void); -+ ESP(tsk) -= sizeof(unsigned long); -+ *(unsigned long*)ESP(tsk) = (unsigned long)ret_from_fork2; -+ tsk->thread_info->flags |= _TIF_RESUME; -+ } -+#else -+ tsk->thread.esp -= 4; -+ *(__u32*)tsk->thread.esp = tsk->thread.eip; -+ tsk->thread.eip = (unsigned long)pre_ret_from_fork; -+#endif -+ -+ if (ti->cpt_state == TASK_TRACED) -+ tsk->state = TASK_TRACED; -+ else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) { -+ tsk->signal->it_virt_expires = 0; -+ tsk->signal->it_prof_expires = 0; -+ if (tsk->state != EXIT_DEAD) -+ eprintk_ctx("oops, schedule() did not make us dead\n"); -+ } -+ -+ if (thread_group_leader(tsk) && -+ ti->cpt_it_real_value && -+ !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { -+ DEFINE_KTIME(val); -+ -+ if (ctx->image_version != 0) { -+ ktime_t delta; -+ -+ val = ktime_add_ns(val, ti->cpt_it_real_value); -+ delta = timespec_to_ktime(ctx->delta_time); -+ val = ktime_sub(val, delta); -+ if (val.tv64 <= 0) -+ val.tv64 = NSEC_PER_USEC; -+ dprintk("rst 
itimer " CPT_FID " +%Ld %Ld %Lu\n", CPT_TID(tsk), val.tv64, delta.tv64, ti->cpt_it_real_value); -+ } else { -+ unsigned long jif = ti->cpt_it_real_value - -+ timespec_to_jiffies(&ctx->delta_time); -+ if ((long)jif <= 0) -+ jif = 1; -+ val = ktime_add_ns(val, (u64)jif*TICK_NSEC); -+ } -+ spin_lock_irq(&tsk->sighand->siglock); -+ if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) { -+ /* FIXME. Check!!!! */ -+ hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_REL); -+ } else { -+ wprintk_ctx("Timer clash. Impossible?\n"); -+ } -+ spin_unlock_irq(&tsk->sighand->siglock); -+ -+ dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk), val.tv64); -+ } -+ -+ module_put(THIS_MODULE); -+ } -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_socket.c linux-2.6.16-026test009/kernel/cpt/rst_socket.c ---- linux-2.6.16.orig/kernel/cpt/rst_socket.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_socket.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,797 @@ -+/* -+ * -+ * kernel/cpt/rst_socket.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/fs.h> -+#include <linux/namei.h> -+#include <linux/socket.h> -+#include <linux/un.h> -+#include <net/tcp.h> -+#include <net/sock.h> -+#include <net/scm.h> -+#include <net/af_unix.h> -+ -+#include <ub/ub_mem.h> -+#include <ub/ub_orphan.h> -+#include <ub/ub_orphan.h> -+#include <ub/ub_net.h> -+#include <ub/ub_tcp.h> -+ -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_files.h" -+#include "cpt_socket.h" -+#include "cpt_kernel.h" -+ -+#include "cpt_syscalls.h" -+ -+ -+static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si, -+ loff_t pos, struct cpt_context *ctx) -+{ -+ if (sk->sk_socket) { -+ sk->sk_socket->flags = si->cpt_ssflags; -+ sk->sk_socket->state = si->cpt_sstate; -+ } -+ sk->sk_reuse = si->cpt_reuse; -+ sk->sk_shutdown = si->cpt_shutdown; -+ sk->sk_userlocks = si->cpt_userlocks; -+ sk->sk_no_check = si->cpt_no_check; -+ sock_reset_flag(sk, SOCK_DBG); -+ if (si->cpt_debug) -+ sock_set_flag(sk, SOCK_DBG); -+ sock_reset_flag(sk, SOCK_RCVTSTAMP); -+ if (si->cpt_rcvtstamp) -+ sock_set_flag(sk, SOCK_RCVTSTAMP); -+ sock_reset_flag(sk, SOCK_LOCALROUTE); -+ if (si->cpt_localroute) -+ sock_set_flag(sk, SOCK_LOCALROUTE); -+ sk->sk_protocol = si->cpt_protocol; -+ sk->sk_err = si->cpt_err; -+ sk->sk_err_soft = si->cpt_err_soft; -+ sk->sk_priority = si->cpt_priority; -+ sk->sk_rcvlowat = si->cpt_rcvlowat; -+ sk->sk_rcvtimeo = si->cpt_rcvtimeo; -+ if (si->cpt_rcvtimeo == CPT_NULL) -+ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; -+ sk->sk_sndtimeo = si->cpt_sndtimeo; -+ if (si->cpt_sndtimeo == CPT_NULL) -+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; -+ sk->sk_rcvbuf = si->cpt_rcvbuf; -+ sk->sk_sndbuf = si->cpt_sndbuf; -+ 
sk->sk_bound_dev_if = si->cpt_bound_dev_if; -+ sk->sk_flags = si->cpt_flags; -+ sk->sk_lingertime = si->cpt_lingertime; -+ if (si->cpt_lingertime == CPT_NULL) -+ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; -+ sk->sk_peercred.pid = si->cpt_peer_pid; -+ sk->sk_peercred.uid = si->cpt_peer_uid; -+ sk->sk_peercred.gid = si->cpt_peer_gid; -+ cpt_timeval_import(&sk->sk_stamp, si->cpt_stamp); -+ return 0; -+} -+ -+static struct file *sock_mapfile(struct socket *sock) -+{ -+ int fd = sock_map_fd(sock); -+ -+ if (fd >= 0) { -+ struct file *file = sock->file; -+ get_file(file); -+ sc_close(fd); -+ return file; -+ } -+ return ERR_PTR(fd); -+} -+ -+/* Assumption is that /tmp exists and writable. -+ * In previous versions we assumed that listen() will autobind -+ * the socket. It does not do this for AF_UNIX by evident reason: -+ * socket in abstract namespace is accessible, unlike socket bound -+ * to deleted FS object. -+ */ -+ -+static int -+select_deleted_name(char * name, cpt_context_t *ctx) -+{ -+ int i; -+ -+ for (i=0; i<100; i++) { -+ struct nameidata nd; -+ unsigned int rnd = net_random(); -+ -+ sprintf(name, "/tmp/SOCK.%08x", rnd); -+ -+ if (path_lookup(name, 0, &nd) != 0) -+ return 0; -+ -+ path_release(&nd); -+ } -+ -+ eprintk_ctx("failed to allocate deleted socket inode\n"); -+ return -ELOOP; -+} -+ -+static int -+bind_unix_socket(struct socket *sock, struct cpt_sock_image *si, -+ cpt_context_t *ctx) -+{ -+ int err; -+ char *name; -+ struct sockaddr* addr; -+ int addrlen; -+ struct sockaddr_un sun; -+ struct nameidata nd; -+ -+ if ((addrlen = si->cpt_laddrlen) <= 2) -+ return 0; -+ -+ nd.dentry = NULL; -+ name = ((char*)si->cpt_laddr) + 2; -+ addr = (struct sockaddr *)si->cpt_laddr; -+ -+ if (name[0]) { -+ err = path_lookup(name, 0, &nd); -+ if (err) { -+ nd.dentry = NULL; -+ } else { -+ if (si->cpt_deleted) { -+ path_release(&nd); -+ nd.dentry = NULL; -+ addr = (struct sockaddr*)&sun; -+ addr->sa_family = AF_UNIX; -+ name = ((char*)addr) + 2; -+ err = 
select_deleted_name(name, ctx); -+ if (err) -+ return err; -+ addrlen = 2 + strlen(name); -+ } else if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) { -+ eprintk_ctx("bind_unix_socket: not a socket dentry\n"); -+ path_release(&nd); -+ return -EINVAL; -+ } -+ } -+ if (nd.dentry) -+ sc_unlink(name); -+ } -+ -+ err = sock->ops->bind(sock, addr, addrlen); -+ -+ if (!err) { -+ if (nd.dentry) { -+ sc_chown(name, nd.dentry->d_inode->i_uid, -+ nd.dentry->d_inode->i_gid); -+ sc_chmod(name, nd.dentry->d_inode->i_mode); -+ } -+ if (si->cpt_deleted && name[0]) -+ sc_unlink(name); -+ } -+ if (nd.dentry) -+ path_release(&nd); -+ return err; -+} -+ -+static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si, -+ struct cpt_context *ctx) -+{ -+ struct sock *sk = sock->sk; -+ cpt_object_t *obj; -+ struct sock *parent; -+ -+ if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN) -+ return 0; -+ -+ if (si->cpt_parent == -1) -+ return bind_unix_socket(sock, si, ctx); -+ -+ obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); -+ if (!obj) -+ return 0; -+ -+ parent = obj->o_obj; -+ if (unix_sk(parent)->addr) { -+ if (unix_sk(sk)->addr && -+ atomic_dec_and_test(&unix_sk(sk)->addr->refcnt)) -+ kfree(unix_sk(sk)->addr); -+ atomic_inc(&unix_sk(parent)->addr->refcnt); -+ unix_sk(sk)->addr = unix_sk(parent)->addr; -+ } -+ return 0; -+} -+ -+ -+static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si, -+ struct cpt_context *ctx) -+{ -+ int err; -+ struct socket *sock; -+ struct socket *sock2 = NULL; -+ struct file *file; -+ cpt_object_t *fobj; -+ cpt_object_t *pobj = NULL; -+ -+ err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, -+ &sock); -+ if (err) -+ return err; -+ -+ if (si->cpt_socketpair) { -+ err = sock_create_kern(si->cpt_family, si->cpt_type, -+ si->cpt_protocol, &sock2); -+ if (err) -+ goto err_out; -+ -+ err = sock->ops->socketpair(sock, sock2); -+ if (err < 0) -+ goto err_out; -+ -+ /* Socketpair with a peer outside our 
environment. -+ * So, we create real half-open pipe and do not worry -+ * about dead end anymore. */ -+ if (si->cpt_peer == -1) { -+ sock_release(sock2); -+ sock2 = NULL; -+ } -+ } -+ -+ cpt_obj_setobj(obj, sock->sk, ctx); -+ -+ if (si->cpt_file != CPT_NULL) { -+ file = sock_mapfile(sock); -+ err = PTR_ERR(file); -+ if (IS_ERR(file)) -+ goto err_out; -+ -+ err = -ENOMEM; -+ -+ obj->o_parent = file; -+ -+ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) -+ goto err_out; -+ cpt_obj_setpos(fobj, si->cpt_file, ctx); -+ cpt_obj_setindex(fobj, si->cpt_index, ctx); -+ } -+ -+ if (sock2) { -+ struct file *file2; -+ -+ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx); -+ if (!pobj) BUG(); -+ if (pobj->o_obj) BUG(); -+ cpt_obj_setobj(pobj, sock2->sk, ctx); -+ -+ if (pobj->o_ppos != CPT_NULL) { -+ file2 = sock_mapfile(sock2); -+ err = PTR_ERR(file2); -+ if (IS_ERR(file2)) -+ goto err_out; -+ -+ err = -ENOMEM; -+ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL) -+ goto err_out; -+ cpt_obj_setpos(fobj, pobj->o_ppos, ctx); -+ cpt_obj_setindex(fobj, si->cpt_peer, ctx); -+ -+ pobj->o_parent = file2; -+ } -+ } -+ -+ setup_sock_common(sock->sk, si, obj->o_pos, ctx); -+ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) { -+ inet_sk(sock->sk)->freebind = 1; -+ if (si->cpt_laddrlen) { -+ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); -+ if (err) { -+ dprintk_ctx("binding failed: %d, do not worry\n", err); -+ } -+ } -+ rst_socket_in(si, obj->o_pos, sock->sk, ctx); -+ } else if (sock->sk->sk_family == AF_NETLINK) { -+ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); -+ if (err) { -+ eprintk_ctx("AF_NETLINK binding failed: %d\n", err); -+ } -+ if (si->cpt_raddrlen) { -+ err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK); -+ if (err) { -+ eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err); -+ } -+ } -+ } -+ 
fixup_unix_address(sock, si, ctx); -+ -+ if (sock2) { -+ err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx); -+ if (err) -+ return err; -+ setup_sock_common(sock2->sk, si, pobj->o_pos, ctx); -+ fixup_unix_address(sock2, si, ctx); -+ } -+ -+ if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) -+ && (int)si->cpt_parent != -1) { -+ cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); -+ if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0) -+ sock->sk = NULL; -+ } -+ -+ -+ if (si->cpt_file == CPT_NULL && sock->sk && -+ sock->sk->sk_family == AF_INET) { -+ struct sock *sk = sock->sk; -+ -+ if (sk) { -+ sock->sk = NULL; -+ -+ local_bh_disable(); -+ bh_lock_sock(sk); -+ if (sock_owned_by_user(sk)) -+ eprintk_ctx("oops, sock is locked by user\n"); -+ -+ sock_hold(sk); -+ sock_orphan(sk); -+ ub_inc_orphan_count(sk); -+ bh_unlock_sock(sk); -+ local_bh_enable(); -+ sock_put(sk); -+ dprintk_ctx("orphaning socket %p\n", sk); -+ } -+ } -+ -+ if (si->cpt_file == CPT_NULL && sock->sk == NULL) -+ sock_release(sock); -+ -+ return 0; -+ -+err_out: -+ if (sock2) -+ sock_release(sock2); -+ sock_release(sock); -+ return err; -+} -+ -+static int open_listening_socket(loff_t pos, struct cpt_sock_image *si, -+ struct cpt_context *ctx) -+{ -+ int err; -+ struct socket *sock; -+ struct file *file; -+ cpt_object_t *obj, *fobj; -+ -+ err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, -+ &sock); -+ if (err) { -+ eprintk_ctx("open_listening_socket: sock_create_kern: %d\n", err); -+ return err; -+ } -+ -+ sock->sk->sk_reuse = 2; -+ sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if; -+ -+ if (sock->sk->sk_family == AF_UNIX) { -+ err = bind_unix_socket(sock, si, ctx); -+ } else if (si->cpt_laddrlen) { -+ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) -+ inet_sk(sock->sk)->freebind = 1; -+ -+ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); -+ -+ if (err) { 
-+ eprintk_ctx("open_listening_socket: bind: %d\n", err); -+ goto err_out; -+ } -+ } -+ -+ err = sock->ops->listen(sock, si->cpt_max_ack_backlog); -+ if (err) { -+ eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted); -+ goto err_out; -+ } -+ -+ /* Now we may access socket body directly and fixup all the things. */ -+ -+ file = sock_mapfile(sock); -+ err = PTR_ERR(file); -+ if (IS_ERR(file)) { -+ eprintk_ctx("open_listening_socket: map: %d\n", err); -+ goto err_out; -+ } -+ -+ err = -ENOMEM; -+ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) -+ goto err_out; -+ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL) -+ goto err_out; -+ cpt_obj_setpos(obj, pos, ctx); -+ cpt_obj_setindex(obj, si->cpt_index, ctx); -+ obj->o_parent = file; -+ cpt_obj_setpos(fobj, si->cpt_file, ctx); -+ cpt_obj_setindex(fobj, si->cpt_index, ctx); -+ -+ setup_sock_common(sock->sk, si, pos, ctx); -+ -+ if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) -+ rst_restore_synwait_queue(sock->sk, si, pos, ctx); -+ -+ return 0; -+ -+err_out: -+ sock_release(sock); -+ return err; -+} -+ -+ -+struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx) -+{ -+ int err; -+ struct sk_buff *skb; -+ struct cpt_skb_image v; -+ loff_t pos = *pos_p; -+ struct scm_fp_list *fpl = NULL; -+ struct timeval tmptv; -+ -+ err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx); -+ if (err) -+ return ERR_PTR(err); -+ *pos_p = pos + v.cpt_next; -+ -+ if (owner) -+ *owner = v.cpt_owner; -+ if (queue) -+ *queue = v.cpt_queue; -+ -+ skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL); -+ if (skb == NULL) -+ return ERR_PTR(-ENOMEM); -+ skb_reserve(skb, v.cpt_hspace); -+ skb_put(skb, v.cpt_len); -+ skb->h.raw = skb->head + v.cpt_h; -+ skb->nh.raw = skb->head + v.cpt_nh; -+ skb->mac.raw = skb->head + v.cpt_mac; -+ if (sizeof(skb->cb) < sizeof(v.cpt_cb)) BUG(); -+ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); -+ 
skb->mac_len = v.cpt_mac_len;
-+
-+	skb->csum = v.cpt_csum;
-+	skb->local_df = v.cpt_local_df;
-+	skb->pkt_type = v.cpt_pkt_type;
-+	skb->ip_summed = v.cpt_ip_summed;
-+	skb->priority = v.cpt_priority;
-+	skb->protocol = v.cpt_protocol;
-+	cpt_timeval_import(&tmptv, v.cpt_stamp);
-+	skb_set_timestamp(skb, &tmptv);
-+
-+	skb_shinfo(skb)->tso_segs = v.cpt_tso_segs;
-+	skb_shinfo(skb)->tso_size = v.cpt_tso_size;
-+	/* For image_version 0 the TSO fields read above are overridden. */
-+	if (ctx->image_version == 0) {
-+		skb_shinfo(skb)->tso_segs = 1;
-+		skb_shinfo(skb)->tso_size = 0;
-+	}
-+
-+	/* Walk the sub-objects stored between the skb header and the next
-+	 * object: CPT_OBJ_BITS carries the buffer contents, CPT_OBJ_FILEDESC
-+	 * describes a file attached to the skb. */
-+	if (v.cpt_next > v.cpt_hdrlen) {
-+		pos = pos + v.cpt_hdrlen;
-+		while (pos < *pos_p) {
-+			union {
-+				struct cpt_obj_bits b;
-+				struct cpt_fd_image f;
-+			} u;
-+
-+			err = rst_get_object(-1, pos, &u, ctx);
-+			if (err) {
-+				kfree_skb(skb);
-+				return ERR_PTR(err);
-+			}
-+			if (u.b.cpt_object == CPT_OBJ_BITS) {
-+				/* The data block must match headroom + payload exactly. */
-+				if (u.b.cpt_size != v.cpt_hspace + skb->len) {
-+					eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len);
-+					kfree_skb(skb);
-+					return ERR_PTR(-EINVAL);
-+				}
-+
-+				err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen);
-+				if (err) {
-+					kfree_skb(skb);
-+					return ERR_PTR(err);
-+				}
-+			} else if (u.f.cpt_object == CPT_OBJ_FILEDESC) {
-+				if (!fpl) {
-+					fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
-+					if (!fpl) {
-+						kfree_skb(skb);
-+						return ERR_PTR(-ENOMEM);
-+					}
-+					fpl->count = 0;
-+					UNIXCB(skb).fp = fpl;
-+				}
-+				if (fpl->count >= SCM_MAX_FD) {
-+					/* scm_fp_list.fp[] has SCM_MAX_FD slots (net/scm.h);
-+					 * extra descriptors in the image are dropped
-+					 * instead of being written past the array. */
-+					eprintk_ctx("too many files passed in skb image\n");
-+				} else {
-+					/* A file that fails to restore is skipped, not fatal. */
-+					fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx);
-+					if (!IS_ERR(fpl->fp[fpl->count]))
-+						fpl->count++;
-+				}
-+			}
-+			pos += u.b.cpt_next;
-+		}
-+	}
-+
-+	return skb;
-+}
-+
-+static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-+{
-+	int i;
-+	scm->fp = UNIXCB(skb).fp;
-+	skb->destructor = sock_wfree;
-+	UNIXCB(skb).fp = NULL;
-+
-+	for (i=scm->fp->count-1; i>=0; i--)
-+		unix_notinflight(scm->fp->fp[i]);
-+}
-+
-+static void unix_destruct_fds(struct sk_buff *skb)
-+{
-+	struct scm_cookie scm;
-+	memset(&scm, 0, sizeof(scm));
-+	unix_detach_fds(&scm, skb);
-+	
scm_destroy(&scm); -+ sock_wfree(skb); -+ module_put(THIS_MODULE); -+} -+ -+ -+static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si, -+ loff_t pos, struct cpt_context *ctx) -+{ -+ loff_t endpos; -+ -+ pos = pos + si->cpt_hdrlen; -+ endpos = pos + si->cpt_next; -+ while (pos < endpos) { -+ struct sk_buff *skb; -+ struct sock *owner_sk; -+ __u32 owner; -+ -+ skb = rst_skb(&pos, &owner, NULL, ctx); -+ if (IS_ERR(skb)) -+ return PTR_ERR(skb); -+ -+ owner_sk = unix_peer(sk); -+ if (owner != -1) { -+ cpt_object_t *pobj; -+ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx); -+ if (pobj == NULL) { -+ eprintk_ctx("orphan af_unix skb?\n"); -+ kfree_skb(skb); -+ continue; -+ } -+ owner_sk = pobj->o_obj; -+ } -+ if (owner_sk == NULL) { -+ dprintk_ctx("orphan af_unix skb 2?\n"); -+ kfree_skb(skb); -+ continue; -+ } -+ skb_set_owner_w(skb, owner_sk); -+ if (UNIXCB(skb).fp) { -+ skb->destructor = unix_destruct_fds; -+ if (!try_module_get(THIS_MODULE)) BUG(); -+ } -+ skb_queue_tail(&sk->sk_receive_queue, skb); -+ if (sk->sk_state == TCP_LISTEN) { -+ struct socket *sock = skb->sk->sk_socket; -+ if (sock == NULL) BUG(); -+ if (sock->file) BUG(); -+ skb->sk->sk_socket = NULL; -+ skb->sk->sk_sleep = NULL; -+ sock->sk = NULL; -+ sock_release(sock); -+ } -+ } -+ return 0; -+} -+ -+ -+/* All the sockets are created before we start to open files */ -+ -+int rst_sockets(struct cpt_context *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_SOCKET]; -+ loff_t endsec; -+ cpt_object_t *obj; -+ struct cpt_section_hdr h; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) { -+ eprintk_ctx("rst_sockets: ctx->pread: %d\n", err); -+ return err; -+ } -+ if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) { -+ eprintk_ctx("rst_sockets: hdr err\n"); -+ return -EINVAL; -+ } -+ -+ /* The first pass: we create socket index and open listening sockets. 
*/
-+	endsec = sec + h.cpt_next;
-+	sec += h.cpt_hdrlen;
-+	while (sec < endsec) {
-+		struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
-+		loff_t next;
-+
-+		err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
-+		if (err) {
-+			eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
-+			cpt_release_buf(ctx);
-+			return err;
-+		}
-+		/* Everything needed from the image is copied out before the
-+		 * buffer is released; sbuf is not touched afterwards. */
-+		next = sbuf->cpt_next;
-+		if (sbuf->cpt_state == TCP_LISTEN) {
-+			err = open_listening_socket(sec, sbuf, ctx);
-+			cpt_release_buf(ctx);
-+			if (err) {
-+				eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err);
-+				return err;
-+			}
-+		} else {
-+			/* Not listening: only index it now, open in pass 2. */
-+			obj = alloc_cpt_object(GFP_KERNEL, ctx);
-+			if (obj == NULL) {
-+				cpt_release_buf(ctx);
-+				return -ENOMEM;
-+			}
-+			cpt_obj_setindex(obj, sbuf->cpt_index, ctx);
-+			cpt_obj_setpos(obj, sec, ctx);
-+			obj->o_ppos = sbuf->cpt_file;
-+			cpt_release_buf(ctx);
-+			intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx);
-+		}
-+		sec += next;
-+	}
-+
-+	/* Pass 2: really restore sockets */
-+	for_each_object(obj, CPT_OBJ_SOCKET) {
-+		struct cpt_sock_image *sbuf;
-+		/* Already opened (listening sockets, socketpair peers). */
-+		if (obj->o_obj != NULL)
-+			continue;
-+		sbuf = cpt_get_buf(ctx);
-+		err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
-+		if (err) {
-+			eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
-+			cpt_release_buf(ctx);
-+			return err;
-+		}
-+		if (sbuf->cpt_state == TCP_LISTEN) BUG();
-+		err = open_socket(obj, sbuf, ctx);
-+		cpt_release_buf(ctx);
-+		if (err) {
-+			eprintk_ctx("rst_sockets: open_socket: %d\n", err);
-+			return err;
-+		}
-+	}
-+
-+	return 0;
-+}
-+
-+/*
-+ * Restore the sockets stored in the CPT_SECT_ORPHANS section. Each image is
-+ * opened through open_socket() using a temporary cpt object that is freed
-+ * again right after the call.
-+ *
-+ * Returns 0 (also when the section is absent) or a negative error.
-+ */
-+int rst_orphans(struct cpt_context *ctx)
-+{
-+	int err;
-+	loff_t sec = ctx->sections[CPT_SECT_ORPHANS];
-+	loff_t endsec;
-+	cpt_object_t *obj;
-+	struct cpt_section_hdr h;
-+
-+	if (sec == CPT_NULL)
-+		return 0;
-+
-+	err = ctx->pread(&h, sizeof(h), ctx, sec);
-+	if (err)
-+		return err;
-+	if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h))
-+		return -EINVAL;
-+
-+	endsec = sec + h.cpt_next;
-+	sec += h.cpt_hdrlen;
-+	while (sec < endsec) {
-+		struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
-+		loff_t next;
-+
-+		err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
-+		if (err) {
-+			cpt_release_buf(ctx);
-+			return err;
-+		}
-+		/* Saved now: open_socket() can load a peer image into sbuf,
-+		 * and the buffer is released before sec is advanced. */
-+		next = sbuf->cpt_next;
-+		obj = alloc_cpt_object(GFP_KERNEL, ctx);
-+		if (obj == NULL) {
-+			cpt_release_buf(ctx);
-+			return -ENOMEM;
-+		}
-+		obj->o_pos = sec;
-+		obj->o_ppos = sbuf->cpt_file;
-+		err = open_socket(obj, sbuf, ctx);
-+		dprintk_ctx("Restoring orphan: %d\n", err);
-+		free_cpt_object(obj, ctx);
-+		cpt_release_buf(ctx);
-+		if (err)
-+			return err;
-+		sec += next;
-+	}
-+
-+	return 0;
-+}
-+
-+
-+/* Pass 3: I understand, this is not funny already :-),
-+ * but we have to do another pass to establish links between
-+ * not-paired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX
-+ * skb queues with proper skb->sk links.
-+ *
-+ * This could be made at the end of rst_sockets(), but we defer
-+ * restoring af_unix queues up to the end of restoring files to
-+ * make restoring passed FDs cleaner.
-+ */
-+
-+int rst_sockets_complete(struct cpt_context *ctx)
-+{
-+	int err;
-+	cpt_object_t *obj;
-+
-+	for_each_object(obj, CPT_OBJ_SOCKET) {
-+		struct cpt_sock_image *sbuf;
-+		struct sock *sk = obj->o_obj;
-+		struct sock *peer;
-+
-+		if (!sk) BUG();
-+
-+		if (sk->sk_family != AF_UNIX)
-+			continue;
-+
-+		sbuf = cpt_get_buf(ctx);
-+		err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
-+		if (err) {
-+			cpt_release_buf(ctx);
-+			return err;
-+		}
-+
-+		if (sbuf->cpt_next > sbuf->cpt_hdrlen)
-+			restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx);
-+
-+		cpt_release_buf(ctx);
-+
-+		if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) {
-+			cpt_object_t *pobj;
-+
-+			sbuf = cpt_get_buf(ctx);
-+			err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
-+			if (err) {
-+				cpt_release_buf(ctx);
-+				return err;
-+			}
-+
-+			if (sbuf->cpt_peer != -1) {
-+				pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx);
-+				if (pobj) {
-+					peer = pobj->o_obj;
-+					sock_hold(peer);
-+					unix_peer(sk) = peer;
-+				}
-+			}
-+			cpt_release_buf(ctx);
-+		}
-+	}
-+
-+	rst_orphans(ctx);
-+
-+	return 0;
-+}
-+
-diff -upr 
linux-2.6.16.orig/kernel/cpt/rst_socket_in.c linux-2.6.16-026test009/kernel/cpt/rst_socket_in.c ---- linux-2.6.16.orig/kernel/cpt/rst_socket_in.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_socket_in.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,449 @@ -+/* -+ * -+ * kernel/cpt/rst_socket_in.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/fs.h> -+#include <linux/socket.h> -+#include <linux/tcp.h> -+#include <linux/jhash.h> -+#include <net/sock.h> -+#include <net/tcp.h> -+#include <linux/ipv6.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_socket.h" -+#include "cpt_kernel.h" -+ -+static inline unsigned long jiffies_import(__u32 tmo) -+{ -+ __s32 delta = tmo; -+ return jiffies + (long)delta; -+} -+ -+static inline __u32 tcp_jiffies_import(__u32 tmo) -+{ -+ return ((__u32)jiffies) + tmo; -+} -+ -+ -+static int restore_queues(struct sock *sk, struct cpt_sock_image *si, -+ loff_t pos, struct cpt_context *ctx) -+{ -+ loff_t endpos; -+ -+ pos = pos + si->cpt_hdrlen; -+ endpos = pos + si->cpt_next; -+ while (pos < endpos) { -+ struct sk_buff *skb; -+ __u32 type; -+ -+ skb = rst_skb(&pos, NULL, &type, ctx); -+ if (IS_ERR(skb)) -+ return PTR_ERR(skb); -+ -+ if (sk->sk_type == SOCK_STREAM) { -+ if (type == CPT_SKB_RQ) { -+ sk_stream_set_owner_r(skb, sk); -+ ub_tcprcvbuf_charge_forced(sk, skb); -+ skb_queue_tail(&sk->sk_receive_queue, skb); -+ } else if (type == CPT_SKB_OFOQ) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ sk_stream_set_owner_r(skb, sk); -+ ub_tcprcvbuf_charge_forced(sk, skb); -+ 
skb_queue_tail(&tp->out_of_order_queue, skb); -+ } else if (type == CPT_SKB_WQ) { -+ sk->sk_wmem_queued += skb->truesize; -+ sk->sk_forward_alloc -= skb->truesize; -+ ub_tcpsndbuf_charge_forced(sk, skb); -+ skb_queue_tail(&sk->sk_write_queue, skb); -+ } else { -+ wprintk_ctx("strange stream queue type %u\n", type); -+ kfree_skb(skb); -+ } -+ } else { -+ if (type == CPT_SKB_RQ) { -+ skb_set_owner_r(skb, sk); -+ skb_queue_tail(&sk->sk_receive_queue, skb); -+ } else if (type == CPT_SKB_WQ) { -+ struct inet_sock *inet = inet_sk(sk); -+ if (inet->cork.fragsize) { -+ skb_set_owner_w(skb, sk); -+ skb_queue_tail(&sk->sk_write_queue, skb); -+ } else { -+ eprintk_ctx("cork skb is dropped\n"); -+ kfree_skb(skb); -+ } -+ } else { -+ wprintk_ctx("strange dgram queue type %u\n", type); -+ kfree_skb(skb); -+ } -+ } -+ } -+ return 0; -+} -+ -+static struct sock *find_parent(__u16 sport, cpt_context_t *ctx) -+{ -+ cpt_object_t *obj; -+ for_each_object(obj, CPT_OBJ_SOCKET) { -+ struct sock *sk = obj->o_obj; -+ if (sk && -+ sk->sk_state == TCP_LISTEN && -+ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && -+ inet_sk(sk)->sport == sport) -+ return sk; -+ } -+ return NULL; -+} -+ -+static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk, -+ struct cpt_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sk_buff *skb; -+ tp->pred_flags = si->cpt_pred_flags; -+ tp->rcv_nxt = si->cpt_rcv_nxt; -+ tp->snd_nxt = si->cpt_snd_nxt; -+ tp->snd_una = si->cpt_snd_una; -+ tp->snd_sml = si->cpt_snd_sml; -+ tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp); -+ tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); -+ tp->tcp_header_len = si->cpt_tcp_header_len; -+ inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending; -+ inet_csk(sk)->icsk_ack.quick = si->cpt_quick; -+ inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong; -+ inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked; -+ inet_csk(sk)->icsk_ack.ato = si->cpt_ato; -+ inet_csk(sk)->icsk_ack.timeout = 
jiffies_import(si->cpt_ack_timeout); -+ inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime); -+ inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size; -+ inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss; -+ tp->snd_wl1 = si->cpt_snd_wl1; -+ tp->snd_wnd = si->cpt_snd_wnd; -+ tp->max_window = si->cpt_max_window; -+ inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie; -+ tp->mss_cache = si->cpt_mss_cache; -+ tp->rx_opt.mss_clamp = si->cpt_mss_clamp; -+ inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; -+ inet_csk(sk)->icsk_ca_state = si->cpt_ca_state; -+ inet_csk(sk)->icsk_retransmits = si->cpt_retransmits; -+ tp->reordering = si->cpt_reordering; -+ tp->frto_counter = si->cpt_frto_counter; -+ tp->frto_highmark = si->cpt_frto_highmark; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) -+ // // tp->adv_cong = si->cpt_adv_cong; -+#endif -+ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; -+ inet_csk(sk)->icsk_backoff = si->cpt_backoff; -+ tp->srtt = si->cpt_srtt; -+ tp->mdev = si->cpt_mdev; -+ tp->mdev_max = si->cpt_mdev_max; -+ tp->rttvar = si->cpt_rttvar; -+ tp->rtt_seq = si->cpt_rtt_seq; -+ inet_csk(sk)->icsk_rto = si->cpt_rto; -+ tp->packets_out = si->cpt_packets_out; -+ tp->left_out = si->cpt_left_out; -+ tp->retrans_out = si->cpt_retrans_out; -+ tp->lost_out = si->cpt_lost_out; -+ tp->sacked_out = si->cpt_sacked_out; -+ tp->fackets_out = si->cpt_fackets_out; -+ tp->snd_ssthresh = si->cpt_snd_ssthresh; -+ tp->snd_cwnd = si->cpt_snd_cwnd; -+ tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt; -+ tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp; -+ tp->snd_cwnd_used = si->cpt_snd_cwnd_used; -+ tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp); -+ inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout); -+ tp->rcv_wnd = si->cpt_rcv_wnd; -+ tp->rcv_wup = si->cpt_rcv_wup; -+ tp->write_seq = si->cpt_write_seq; -+ tp->pushed_seq = si->cpt_pushed_seq; -+ tp->copied_seq = si->cpt_copied_seq; -+ 
tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok; -+ tp->rx_opt.wscale_ok = si->cpt_wscale_ok; -+ tp->rx_opt.sack_ok = si->cpt_sack_ok; -+ tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp; -+ tp->rx_opt.snd_wscale = si->cpt_snd_wscale; -+ tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale; -+ tp->nonagle = si->cpt_nonagle; -+ tp->keepalive_probes = si->cpt_keepalive_probes; -+ tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval; -+ tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr; -+ tp->rx_opt.ts_recent = si->cpt_ts_recent; -+ tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp; -+ tp->rx_opt.user_mss = si->cpt_user_mss; -+ tp->rx_opt.dsack = si->cpt_dsack; -+ tp->rx_opt.eff_sacks = si->cpt_num_sacks; -+ tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0]; -+ tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1]; -+ tp->selective_acks[0].start_seq = si->cpt_sack_array[2]; -+ tp->selective_acks[0].end_seq = si->cpt_sack_array[3]; -+ tp->selective_acks[1].start_seq = si->cpt_sack_array[4]; -+ tp->selective_acks[1].end_seq = si->cpt_sack_array[5]; -+ tp->selective_acks[2].start_seq = si->cpt_sack_array[6]; -+ tp->selective_acks[2].end_seq = si->cpt_sack_array[7]; -+ tp->selective_acks[3].start_seq = si->cpt_sack_array[8]; -+ tp->selective_acks[3].end_seq = si->cpt_sack_array[9]; -+ -+ tp->window_clamp = si->cpt_window_clamp; -+ tp->rcv_ssthresh = si->cpt_rcv_ssthresh; -+ inet_csk(sk)->icsk_probes_out = si->cpt_probes_out; -+ tp->rx_opt.num_sacks = si->cpt_num_sacks; -+ tp->advmss = si->cpt_advmss; -+ inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; -+ tp->ecn_flags = si->cpt_ecn_flags; -+ tp->prior_ssthresh = si->cpt_prior_ssthresh; -+ tp->high_seq = si->cpt_high_seq; -+ tp->retrans_stamp = si->cpt_retrans_stamp; -+ tp->undo_marker = si->cpt_undo_marker; -+ tp->undo_retrans = si->cpt_undo_retrans; -+ tp->urg_seq = si->cpt_urg_seq; -+ tp->urg_data = si->cpt_urg_data; -+ inet_csk(sk)->icsk_pending = si->cpt_pending; -+ tp->urg_mode = si->cpt_urg_mode; -+ tp->snd_up = si->cpt_snd_up; -+ 
tp->keepalive_time = si->cpt_keepalive_time; -+ tp->keepalive_intvl = si->cpt_keepalive_intvl; -+ tp->linger2 = si->cpt_linger2; -+ -+ sk->sk_send_head = NULL; -+ for (skb = skb_peek(&sk->sk_write_queue); -+ skb && skb != (struct sk_buff*)&sk->sk_write_queue; -+ skb = skb->next) { -+ if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) { -+ sk->sk_send_head = skb; -+ break; -+ } -+ } -+ -+ if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) { -+ struct inet_sock *inet = inet_sk(sk); -+ if (inet->num == 0) { -+ cpt_object_t *lobj = NULL; -+ -+ if ((int)si->cpt_parent != -1) -+ lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); -+ -+ if (lobj && lobj->o_obj) { -+ inet->num = ntohs(inet->sport); -+ local_bh_disable(); -+ __inet_inherit_port(&tcp_hashinfo, lobj->o_obj, sk); -+ local_bh_enable(); -+ dprintk_ctx("port inherited from parent\n"); -+ } else { -+ struct sock *lsk = find_parent(inet->sport, ctx); -+ if (lsk) { -+ inet->num = ntohs(inet->sport); -+ local_bh_disable(); -+ __inet_inherit_port(&tcp_hashinfo, lsk, sk); -+ local_bh_enable(); -+ dprintk_ctx("port inherited\n"); -+ } else { -+ eprintk_ctx("we are kinda lost...\n"); -+ } -+ } -+ } -+ -+ sk->sk_prot->hash(sk); -+ -+ if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER) -+ sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout); -+ if (inet_csk(sk)->icsk_pending) -+ sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer, -+ inet_csk(sk)->icsk_timeout); -+ if (sock_flag(sk, SOCK_KEEPOPEN)) { -+ unsigned long expires = jiffies_import(si->cpt_ka_timeout); -+ if (time_after(jiffies, expires)) -+ expires = jiffies + HZ; -+ sk_reset_timer(sk, &sk->sk_timer, expires); -+ } -+ } -+ -+ return 0; -+} -+ -+ -+int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk, -+ struct cpt_context *ctx) -+{ -+ struct inet_sock *inet = inet_sk(sk); -+ -+ lock_sock(sk); -+ -+ sk->sk_state = si->cpt_state; -+ -+ inet->daddr = si->cpt_daddr; -+ inet->dport = 
si->cpt_dport; -+ inet->saddr = si->cpt_saddr; -+ inet->rcv_saddr = si->cpt_rcv_saddr; -+ inet->sport = si->cpt_sport; -+ inet->uc_ttl = si->cpt_uc_ttl; -+ inet->tos = si->cpt_tos; -+ inet->cmsg_flags = si->cpt_cmsg_flags; -+ inet->mc_index = si->cpt_mc_index; -+ inet->mc_addr = si->cpt_mc_addr; -+ inet->hdrincl = si->cpt_hdrincl; -+ inet->mc_ttl = si->cpt_mc_ttl; -+ inet->mc_loop = si->cpt_mc_loop; -+ inet->pmtudisc = si->cpt_pmtudisc; -+ inet->recverr = si->cpt_recverr; -+ inet->freebind = si->cpt_freebind; -+ inet->id = si->cpt_idcounter; -+ -+ inet->cork.flags = si->cpt_cork_flags; -+ inet->cork.fragsize = si->cpt_cork_fragsize; -+ inet->cork.length = si->cpt_cork_length; -+ inet->cork.addr = si->cpt_cork_addr; -+ inet->cork.fl.fl4_src = si->cpt_cork_saddr; -+ inet->cork.fl.fl4_dst = si->cpt_cork_daddr; -+ inet->cork.fl.oif = si->cpt_cork_oif; -+ if (inet->cork.fragsize) { -+ if (ip_route_output_key(&inet->cork.rt, &inet->cork.fl)) { -+ eprintk_ctx("failed to restore cork route\n"); -+ inet->cork.fragsize = 0; -+ } -+ } -+ -+ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { -+ struct udp_sock *up = udp_sk(sk); -+ up->pending = si->cpt_udp_pending; -+ up->corkflag = si->cpt_udp_corkflag; -+ up->encap_type = si->cpt_udp_encap; -+ up->len = si->cpt_udp_len; -+ } -+ -+ if (sk->sk_family == AF_INET6) { -+ struct ipv6_pinfo *np = inet6_sk(sk); -+ -+ memcpy(&np->saddr, si->cpt_saddr6, 16); -+ memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16); -+ memcpy(&np->daddr, si->cpt_daddr6, 16); -+ np->flow_label = si->cpt_flow_label6; -+ np->frag_size = si->cpt_frag_size6; -+ np->hop_limit = si->cpt_hop_limit6; -+ np->mcast_hops = si->cpt_mcast_hops6; -+ np->mcast_oif = si->cpt_mcast_oif6; -+ np->rxopt.all = si->cpt_rxopt6; -+ np->mc_loop = si->cpt_mc_loop6; -+ np->recverr = si->cpt_recverr6; -+ np->sndflow = si->cpt_sndflow6; -+ np->pmtudisc = si->cpt_pmtudisc6; -+ np->ipv6only = si->cpt_ipv6only6; -+ -+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+ 
if (si->cpt_mapped) {
-+			if (sk->sk_type == SOCK_STREAM &&
-+			    sk->sk_protocol == IPPROTO_TCP) {
-+				struct tcp_sock *tp = tcp_sk(sk);
-+				tp->af_specific = &ipv6_mapped;
-+				sk->sk_backlog_rcv = tcp_v4_do_rcv;
-+			}
-+		}
-+#endif
-+	}
-+
-+	restore_queues(sk, si, pos, ctx);
-+
-+	if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
-+		rst_socket_tcp(si, pos, sk, ctx);
-+
-+	release_sock(sk);
-+	return 0;
-+}
-+
-+/*
-+ * Queue restored socket @sk on the accept queue of listening socket @lsk.
-+ * @sk is detached from its struct socket (sk_socket/sk_sleep cleared) first.
-+ *
-+ * Returns 0, -EINVAL if @lsk is not in TCP_LISTEN, -ENOMEM if no request
-+ * sock could be allocated.
-+ */
-+int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx)
-+{
-+	struct request_sock *req;
-+
-+	if (lsk->sk_state != TCP_LISTEN)
-+		return -EINVAL;
-+
-+	req = reqsk_alloc(&tcp_request_sock_ops);
-+	if (!req)
-+		return -ENOMEM;
-+
-+	sk->sk_socket = NULL;
-+	sk->sk_sleep = NULL;
-+	inet_csk_reqsk_queue_add(lsk, req, sk);
-+	return 0;
-+}
-+
-+static __inline__ u32 __tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
-+{
-+	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
-+}
-+
-+/*
-+ * Rebuild the pending connection requests of listening socket @sk from the
-+ * CPT_OBJ_OPENREQ images that follow the socket header of the image at @pos.
-+ *
-+ * Returns 0, the error from rst_get_object(), or -ENOMEM.
-+ */
-+int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si,
-+			      loff_t pos, struct cpt_context *ctx)
-+{
-+	int err;
-+	/* cpt_next is relative to the start of the object; make the limit
-+	 * absolute, otherwise the loop below never runs. */
-+	loff_t end = pos + si->cpt_next;
-+
-+	pos += si->cpt_hdrlen;
-+	while (pos < end) {
-+		struct cpt_openreq_image oi;
-+
-+		err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx);
-+		if (err)
-+			return err;
-+
-+		if (oi.cpt_object == CPT_OBJ_OPENREQ) {
-+			struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops);
-+			if (req == NULL)
-+				return -ENOMEM;
-+
-+			memset(req, 0, sizeof(*req));
-+			/* reqsk_alloc() stored the ops pointer in the request;
-+			 * the memset above cleared it, so put it back. */
-+			req->rsk_ops = &tcp_request_sock_ops;
-+			tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn;
-+			tcp_rsk(req)->snt_isn = oi.cpt_snt_isn;
-+			inet_rsk(req)->rmt_port = oi.cpt_rmt_port;
-+			req->mss = oi.cpt_mss;
-+			req->retrans = oi.cpt_retrans;
-+			inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale;
-+			inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale;
-+			inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok;
-+			inet_rsk(req)->sack_ok = oi.cpt_sack_ok;
-+			inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok;
-+			inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok;
-+			inet_rsk(req)->acked = oi.cpt_acked;
-+			
req->window_clamp = oi.cpt_window_clamp; -+ req->rcv_wnd = oi.cpt_rcv_wnd; -+ req->ts_recent = oi.cpt_ts_recent; -+ req->expires = jiffies_import(oi.cpt_expires); -+ -+ if (oi.cpt_family == AF_INET) { -+ memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4); -+ memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); -+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -+ } else { -+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -+ memcpy(&req->af.v6_req.loc_addr, oi.cpt_loc_addr, 16); -+ memcpy(&req->af.v6_req.rmt_addr, oi.cpt_rmt_addr, 16); -+ req->af.v6_req.iif = oi.cpt_iif; -+ req->class = &or_ipv6; -+ tcp_v6_synq_add(sk, req); -+#endif -+ } -+ } -+ pos += oi.cpt_next; -+ } -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_sysvipc.c linux-2.6.16-026test009/kernel/cpt/rst_sysvipc.c ---- linux-2.6.16.orig/kernel/cpt/rst_sysvipc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_sysvipc.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,409 @@ -+/* -+ * -+ * kernel/cpt/rst_sysvipc.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/major.h> -+#include <linux/pipe_fs_i.h> -+#include <linux/mman.h> -+#include <linux/shm.h> -+/* FIXME. x86_64 has asm/ipc.h forgotten? 
*/ -+#include <asm-generic/ipc.h> -+#include <asm/uaccess.h> -+#include <asm/unistd.h> -+#include <ub/ub_mem.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_kernel.h" -+ -+struct _warg { -+ struct file *file; -+ struct cpt_sysvshm_image *v; -+}; -+ -+static int fixup_one_shm(struct shmid_kernel *shp, void *arg) -+{ -+ struct _warg *warg = arg; -+ -+ if (shp->shm_file != warg->file) -+ return 0; -+ if (shp->shm_nattch) -+ return -EEXIST; -+ -+ shp->shm_perm.uid = warg->v->cpt_uid; -+ shp->shm_perm.gid = warg->v->cpt_gid; -+ shp->shm_perm.cuid = warg->v->cpt_cuid; -+ shp->shm_perm.cgid = warg->v->cpt_cgid; -+ shp->shm_perm.mode = warg->v->cpt_mode; -+ -+ shp->shm_atim = warg->v->cpt_atime; -+ shp->shm_dtim = warg->v->cpt_dtime; -+ shp->shm_ctim = warg->v->cpt_ctime; -+ shp->shm_cprid = warg->v->cpt_creator; -+ shp->shm_lprid = warg->v->cpt_last; -+ -+ /* TODO: fix shp->mlock_user? */ -+ return 1; -+} -+ -+static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v) -+{ -+ struct _warg warg; -+ -+ warg.file = file; -+ warg.v = v; -+ -+ return sysvipc_walk_shm(fixup_one_shm, &warg); -+} -+ -+static int fixup_shm_data(struct file *file, loff_t pos, loff_t end, -+ struct cpt_context *ctx) -+{ -+ struct cpt_page_block pgb; -+ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); -+ -+ do_write = file->f_dentry->d_inode->i_fop->write; -+ if (do_write == NULL) { -+ eprintk_ctx("No TMPFS? 
Cannot restore content of SYSV SHM\n"); -+ return -EINVAL; -+ } -+ -+ while (pos < end) { -+ loff_t opos; -+ loff_t ipos; -+ int count; -+ int err; -+ -+ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); -+ if (err) -+ return err; -+ dprintk_ctx("restoring SHM block: %08x-%08x\n", -+ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); -+ ipos = pos + pgb.cpt_hdrlen; -+ opos = pgb.cpt_start; -+ count = pgb.cpt_end-pgb.cpt_start; -+ while (count > 0) { -+ mm_segment_t oldfs; -+ int copy = count; -+ -+ if (copy > PAGE_SIZE) -+ copy = PAGE_SIZE; -+ (void)cpt_get_buf(ctx); -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); -+ set_fs(oldfs); -+ if (err) { -+ __cpt_release_buf(ctx); -+ return err; -+ } -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ ipos += copy; -+ err = do_write(file, ctx->tmpbuf, copy, &opos); -+ set_fs(oldfs); -+ __cpt_release_buf(ctx); -+ if (err != copy) { -+ eprintk_ctx("write() failure\n"); -+ if (err >= 0) -+ err = -EIO; -+ return err; -+ } -+ count -= copy; -+ } -+ pos += pgb.cpt_next; -+ } -+ return 0; -+} -+ -+struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx) -+{ -+ struct file *file; -+ int err; -+ loff_t dpos, epos; -+ union { -+ struct cpt_file_image fi; -+ struct cpt_sysvshm_image shmi; -+ struct cpt_inode_image ii; -+ } u; -+ -+ err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); -+ if (err < 0) -+ goto err_out; -+ pos = u.fi.cpt_inode; -+ err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); -+ if (err < 0) -+ goto err_out; -+ dpos = pos + u.ii.cpt_hdrlen; -+ epos = pos + u.ii.cpt_next; -+ err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx); -+ if (err < 0) -+ goto err_out; -+ dpos += u.shmi.cpt_next; -+ -+ file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id, -+ u.shmi.cpt_segsz, u.shmi.cpt_mode); -+ if (!IS_ERR(file)) { -+ err = fixup_shm(file, &u.shmi); -+ if (err != -EEXIST && dpos < epos) -+ err = fixup_shm_data(file, dpos, epos, ctx); -+ } -+ -+ return file; -+ 
-+err_out: -+ return ERR_PTR(err); -+} -+ -+static int attach_one_undo(int semid, struct sem_array *sma, void *arg) -+{ -+ struct sem_undo *su = arg; -+ struct sem_undo_list *undo_list = current->sysvsem.undo_list; -+ -+ if (semid != su->semid) -+ return 0; -+ -+ su->proc_next = undo_list->proc_list; -+ undo_list->proc_list = su; -+ -+ su->id_next = sma->undo; -+ sma->undo = su; -+ -+ return 1; -+} -+ -+static int attach_undo(struct sem_undo *su) -+{ -+ return sysvipc_walk_sem(attach_one_undo, su); -+} -+ -+static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx) -+{ -+ int err; -+ struct sem_undo_list *undo_list; -+ -+ if (current->sysvsem.undo_list) { -+ eprintk_ctx("Funny undo_list\n"); -+ return 0; -+ } -+ -+ undo_list = ub_kmalloc(sizeof(struct sem_undo_list), GFP_KERNEL); -+ if (undo_list == NULL) -+ return -ENOMEM; -+ memset(undo_list, 0, sizeof(struct sem_undo_list)); -+ atomic_set(&undo_list->refcnt, 1); -+ spin_lock_init(&undo_list->lock); -+ current->sysvsem.undo_list = undo_list; -+ -+ if (sui->cpt_next > sui->cpt_hdrlen) { -+ loff_t offset = pos + sui->cpt_hdrlen; -+ do { -+ struct sem_undo *new; -+ struct cpt_sysvsem_undo_image spi; -+ err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx); -+ if (err) -+ goto out; -+ new = ub_kmalloc(sizeof(struct sem_undo) + -+ sizeof(short)*spi.cpt_nsem, GFP_KERNEL); -+ if (!new) { -+ err = -ENOMEM; -+ goto out; -+ } -+ -+ memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem); -+ new->semadj = (short *) &new[1]; -+ new->semid = spi.cpt_id; -+ err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen); -+ if (err) { -+ kfree(new); -+ goto out; -+ } -+ err = attach_undo(new); -+ if (err <= 0) { -+ if (err == 0) -+ err = -ENOENT; -+ kfree(new); -+ goto out; -+ } -+ offset += spi.cpt_next; -+ } while (offset < pos + sui->cpt_next); -+ } -+ err = 0; -+ -+out: -+ return err; -+} -+ -+__u32 rst_semundo_flag(struct cpt_task_image *ti, 
struct cpt_context *ctx) -+{ -+ __u32 flag = 0; -+ -+#if 0 -+ if (ti->cpt_sysvsem_undo == CPT_NULL || -+ lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo)) -+ flag |= CLONE_SYSVSEM; -+#endif -+ return flag; -+} -+ -+int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ int err; -+ struct sem_undo_list *f = current->sysvsem.undo_list; -+ cpt_object_t *obj; -+ struct cpt_object_hdr sui; -+ -+ if (ti->cpt_sysvsem_undo == CPT_NULL) { -+ exit_sem(current); -+ return 0; -+ } -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx); -+ if (obj) { -+ if (obj->o_obj != f) { -+ exit_sem(current); -+ f = obj->o_obj; -+ atomic_inc(&f->refcnt); -+ current->sysvsem.undo_list = f; -+ } -+ return 0; -+ } -+ -+ if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0) -+ goto out; -+ -+ if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0) -+ goto out; -+ -+ err = -ENOMEM; -+ obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx); -+ if (obj) { -+ err = 0; -+ cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx); -+ } -+ -+ return 0; -+ -+out: -+ return err; -+} -+ -+struct _sarg { -+ int semid; -+ struct cpt_sysvsem_image *v; -+ __u32 *arr; -+}; -+ -+static int fixup_one_sem(int semid, struct sem_array *sma, void *arg) -+{ -+ struct _sarg *warg = arg; -+ -+ if (semid != warg->semid) -+ return 0; -+ -+ sma->sem_perm.uid = warg->v->cpt_uid; -+ sma->sem_perm.gid = warg->v->cpt_gid; -+ sma->sem_perm.cuid = warg->v->cpt_cuid; -+ sma->sem_perm.cgid = warg->v->cpt_cgid; -+ sma->sem_perm.mode = warg->v->cpt_mode; -+ sma->sem_perm.seq = warg->v->cpt_seq; -+ -+ sma->sem_ctime = warg->v->cpt_ctime; -+ sma->sem_otime = warg->v->cpt_otime; -+ memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8); -+ return 1; -+} -+ -+static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr) -+{ -+ struct _sarg warg; -+ -+ warg.semid = semid; -+ warg.v = v; -+ warg.arr = arr; -+ -+ return 
sysvipc_walk_sem(fixup_one_sem, &warg); -+} -+ -+ -+static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si, -+ struct cpt_context *ctx) -+{ -+ int err; -+ __u32 *arr; -+ int nsems = (si->cpt_next - si->cpt_hdrlen)/8; -+ -+ arr = kmalloc(nsems*8, GFP_KERNEL); -+ if (!arr) -+ return -ENOMEM; -+ -+ err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen); -+ if (err) -+ goto out; -+ err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode); -+ if (err < 0) { -+ eprintk_ctx("SEM 3\n"); -+ goto out; -+ } -+ err = fixup_sem(si->cpt_id, si, arr); -+ if (err == 0) -+ err = -ESRCH; -+ if (err > 0) -+ err = 0; -+out: -+ kfree(arr); -+ return err; -+} -+ -+static int rst_sysv_sem(struct cpt_context *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ struct cpt_sysvsem_image sbuf; -+ -+ if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ int err; -+ err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx); -+ if (err) -+ return err; -+ err = restore_sem(sec, &sbuf, ctx); -+ if (err) -+ return err; -+ sec += sbuf.cpt_next; -+ } -+ return 0; -+} -+ -+int rst_sysv_ipc(struct cpt_context *ctx) -+{ -+ return rst_sysv_sem(ctx); -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_tty.c linux-2.6.16-026test009/kernel/cpt/rst_tty.c ---- linux-2.6.16.orig/kernel/cpt/rst_tty.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_tty.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,380 @@ -+/* -+ * -+ * kernel/cpt/rst_tty.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/major.h> -+#include <linux/pipe_fs_i.h> -+#include <linux/mman.h> -+#include <linux/mount.h> -+#include <linux/tty.h> -+#include <linux/vmalloc.h> -+#include <asm/unistd.h> -+#include <asm/uaccess.h> -+#include <linux/cpt_image.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_mm.h" -+#include "cpt_files.h" -+#include "cpt_kernel.h" -+ -+static int pty_setup(struct tty_struct *stty, loff_t pos, -+ struct cpt_tty_image *pi, struct cpt_context *ctx) -+{ -+ unsigned long flags; -+ -+ stty->pgrp = -1; -+ stty->session = 0; -+ stty->packet = pi->cpt_packet; -+ stty->stopped = pi->cpt_stopped; -+ stty->hw_stopped = pi->cpt_hw_stopped; -+ stty->flow_stopped = pi->cpt_flow_stopped; -+#define DONOT_CHANGE ((1<<TTY_CHARGED)|(1<<TTY_CLOSING)) -+ flags = stty->flags & DONOT_CHANGE; -+ stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE); -+ stty->ctrl_status = pi->cpt_ctrl_status; -+ stty->winsize.ws_row = pi->cpt_ws_row; -+ stty->winsize.ws_col = pi->cpt_ws_col; -+ stty->winsize.ws_ypixel = pi->cpt_ws_prow; -+ stty->winsize.ws_xpixel = pi->cpt_ws_pcol; -+ stty->canon_column = pi->cpt_canon_column; -+ stty->column = pi->cpt_column; -+ stty->raw = pi->cpt_raw; -+ stty->real_raw = pi->cpt_real_raw; -+ stty->erasing = pi->cpt_erasing; -+ stty->lnext = pi->cpt_lnext; -+ stty->icanon = pi->cpt_icanon; -+ stty->closing = pi->cpt_closing; -+ stty->minimum_to_wake = pi->cpt_minimum_to_wake; -+ -+ stty->termios->c_iflag = pi->cpt_c_iflag; -+ stty->termios->c_oflag = pi->cpt_c_oflag; -+ stty->termios->c_lflag = pi->cpt_c_lflag; -+ stty->termios->c_cflag = pi->cpt_c_cflag; -+ memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS); -+ memcpy(stty->read_flags, 
pi->cpt_read_flags, sizeof(stty->read_flags)); -+ -+ if (pi->cpt_next > pi->cpt_hdrlen) { -+ int err; -+ struct cpt_obj_bits b; -+ err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx); -+ if (err) -+ return err; -+ if (b.cpt_size == 0) -+ return 0; -+ err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen); -+ if (err) -+ return err; -+ -+ spin_lock_irq(&stty->read_lock); -+ stty->read_tail = 0; -+ stty->read_cnt = b.cpt_size; -+ stty->read_head = b.cpt_size; -+ stty->canon_head = stty->read_tail + pi->cpt_canon_head; -+ stty->canon_data = pi->cpt_canon_data; -+ spin_unlock_irq(&stty->read_lock); -+ } -+ -+ return 0; -+} -+ -+/* Find slave/master tty in image, when we already know master/slave. -+ * It might be optimized, of course. */ -+static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_TTY]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ struct cpt_tty_image *pibuf; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return CPT_NULL; -+ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) -+ return CPT_NULL; -+ pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL); -+ if (pibuf == NULL) { -+ eprintk_ctx("cannot allocate buffer\n"); -+ return CPT_NULL; -+ } -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) -+ return CPT_NULL; -+ if (pibuf->cpt_index == pi->cpt_index && -+ !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) && -+ pos != sec) { -+ pty_setup(stty, sec, pibuf, ctx); -+ return sec; -+ } -+ sec += pibuf->cpt_next; -+ } -+ kfree(pibuf); -+ return CPT_NULL; -+} -+ -+static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master, -+ struct cpt_context *ctx) -+{ -+ int err; -+ struct iattr newattrs; -+ struct dentry *d = master->f_dentry; -+ -+ newattrs.ia_valid = 
ATTR_UID|ATTR_GID|ATTR_MODE; -+ newattrs.ia_uid = ii->cpt_uid; -+ newattrs.ia_gid = ii->cpt_gid; -+ newattrs.ia_mode = ii->cpt_mode; -+ -+ mutex_lock(&d->d_inode->i_mutex); -+ err = notify_change(d, &newattrs); -+ mutex_unlock(&d->d_inode->i_mutex); -+ -+ return err; -+} -+ -+/* NOTE: "portable", but ugly thing. To allocate /dev/pts/N, we open -+ * /dev/ptmx until we get pty with desired index. -+ */ -+ -+struct file *ptmx_open(int index, unsigned int flags) -+{ -+ struct file *file; -+ struct file **stack = NULL; -+ int depth = 0; -+ -+ for (;;) { -+ struct tty_struct *tty; -+ -+ file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); -+ if (IS_ERR(file)) -+ break; -+ tty = file->private_data; -+ if (tty->index == index) -+ break; -+ -+ if (depth == PAGE_SIZE/sizeof(struct file *)) { -+ fput(file); -+ file = ERR_PTR(-EBUSY); -+ break; -+ } -+ if (stack == NULL) { -+ stack = (struct file **)__get_free_page(GFP_KERNEL); -+ if (!stack) { -+ fput(file); -+ file = ERR_PTR(-ENOMEM); -+ break; -+ } -+ } -+ stack[depth] = file; -+ depth++; -+ } -+ while (depth > 0) { -+ depth--; -+ fput(stack[depth]); -+ } -+ if (stack) -+ free_page((unsigned long)stack); -+ return file; -+} -+ -+ -+struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, -+ unsigned flags, struct cpt_context *ctx) -+{ -+ int err; -+ cpt_object_t *obj; -+ struct file *master, *slave; -+ struct tty_struct *stty; -+ struct cpt_tty_image *pi; -+ static char *a = "pqrstuvwxyzabcde"; -+ static char *b = "0123456789abcdef"; -+ char pairname[16]; -+ unsigned master_flags, slave_flags; -+ -+ if (fi->cpt_priv == CPT_NULL) -+ return ERR_PTR(-EINVAL); -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx); -+ if (obj && obj->o_parent) { -+ dprintk_ctx("obtained pty as pair to existing\n"); -+ master = obj->o_parent; -+ stty = master->private_data; -+ -+ if (stty->driver->subtype == PTY_TYPE_MASTER && -+ (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) { -+ 
wprintk_ctx("cloning ptmx\n"); -+ get_file(master); -+ return master; -+ } -+ -+ master = dentry_open(dget(master->f_dentry), -+ mntget(master->f_vfsmnt), flags); -+ if (!IS_ERR(master)) { -+ stty = master->private_data; -+ if (stty->driver->subtype != PTY_TYPE_MASTER) -+ fixup_tty_attrs(ii, master, ctx); -+ } -+ return master; -+ } -+ -+ pi = cpt_get_buf(ctx); -+ err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx); -+ if (err) { -+ cpt_release_buf(ctx); -+ return ERR_PTR(err); -+ } -+ -+ master_flags = slave_flags = 0; -+ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) -+ master_flags = flags; -+ else -+ slave_flags = flags; -+ -+ /* -+ * Open pair master/slave. -+ */ -+ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) { -+ master = ptmx_open(pi->cpt_index, master_flags); -+ } else { -+ sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]); -+ master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); -+ } -+ if (IS_ERR(master)) { -+ eprintk_ctx("filp_open master: %Ld %ld\n", fi->cpt_priv, PTR_ERR(master)); -+ cpt_release_buf(ctx); -+ return master; -+ } -+ stty = master->private_data; -+ clear_bit(TTY_PTY_LOCK, &stty->flags); -+ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) -+ sprintf(pairname, "/dev/pts/%d", stty->index); -+ else -+ sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]); -+ slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); -+ if (IS_ERR(slave)) { -+ eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave)); -+ fput(master); -+ cpt_release_buf(ctx); -+ return slave; -+ } -+ -+ if (pi->cpt_drv_subtype != PTY_TYPE_MASTER) -+ fixup_tty_attrs(ii, slave, ctx); -+ -+ cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx); -+ cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx); -+ cpt_object_add(CPT_OBJ_FILE, master, ctx); -+ cpt_object_add(CPT_OBJ_FILE, slave, ctx); -+ -+ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) { -+ loff_t pos; -+ obj = lookup_cpt_object(CPT_OBJ_TTY, 
master->private_data, ctx); -+ obj->o_parent = master; -+ cpt_obj_setpos(obj, fi->cpt_priv, ctx); -+ pty_setup(stty, fi->cpt_priv, pi, ctx); -+ -+ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); -+ obj->o_parent = slave; -+ pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx); -+ cpt_obj_setpos(obj, pos, ctx); -+ -+ obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx); -+ cpt_obj_setpos(obj, CPT_NULL, ctx); -+ get_file(master); -+ cpt_release_buf(ctx); -+ return master; -+ } else { -+ loff_t pos; -+ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); -+ obj->o_parent = slave; -+ cpt_obj_setpos(obj, fi->cpt_priv, ctx); -+ pty_setup(stty->link, fi->cpt_priv, pi, ctx); -+ -+ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); -+ obj->o_parent = master; -+ pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx); -+ cpt_obj_setpos(obj, pos, ctx); -+ -+ obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx); -+ cpt_obj_setpos(obj, CPT_NULL, ctx); -+ get_file(slave); -+ cpt_release_buf(ctx); -+ return slave; -+ } -+} -+ -+int rst_tty_jobcontrol(struct cpt_context *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_TTY]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ cpt_object_t *obj; -+ struct cpt_tty_image *pibuf = cpt_get_buf(ctx); -+ -+ if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) { -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx); -+ if (obj) { -+ struct tty_struct *stty = obj->o_obj; -+ if ((int)pibuf->cpt_pgrp > 0) { -+ stty->pgrp = vpid_to_pid(pibuf->cpt_pgrp); -+ if (stty->pgrp == -1) -+ dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp); -+ } else if (pibuf->cpt_pgrp) { -+ stty->pgrp = alloc_pidmap(); -+ if (stty->pgrp < 
0) { -+ eprintk_ctx("cannot allocate stray tty->pgrp"); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ free_pidmap(stty->pgrp); -+ } -+ if ((int)pibuf->cpt_session > 0) { -+ int sess; -+ sess = vpid_to_pid(pibuf->cpt_session); -+ if (sess == -1) { -+ dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session); -+ } else if (stty->session <= 0) { -+ stty->session = sess; -+ } else if (stty->session != sess) { -+ wprintk_ctx("tty session mismatch 2\n"); -+ } -+ } -+ } -+ sec += pibuf->cpt_next; -+ cpt_release_buf(ctx); -+ } -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_ubc.c linux-2.6.16-026test009/kernel/cpt/rst_ubc.c ---- linux-2.6.16.orig/kernel/cpt/rst_ubc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_ubc.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,108 @@ -+/* -+ * -+ * kernel/cpt/rst_ubc.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/types.h> -+#include <ub/beancounter.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+ -+struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx); -+ if (obj == NULL) { -+ printk(KERN_ERR "RST: unknown ub @%Lu\n", pos); -+ return get_beancounter(get_exec_ub()); -+ } -+ return get_beancounter(obj->o_obj); -+} -+ -+static void restore_one_bc_parm(__u64 *dmp, struct ubparm *prm, int held) -+{ -+ prm->barrier = (dmp[0] == CPT_NULL ? UB_MAXVALUE : dmp[0]); -+ prm->limit = (dmp[1] == CPT_NULL ? 
UB_MAXVALUE : dmp[1]); -+ if (held) -+ prm->held = dmp[2]; -+ prm->maxheld = dmp[3]; -+ prm->minheld = dmp[4]; -+ prm->failcnt = dmp[5]; -+} -+ -+static int restore_one_bc(struct cpt_beancounter_image *v, -+ cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ struct user_beancounter *bc; -+ cpt_object_t *pobj; -+ int i; -+ -+ if (v->cpt_parent != CPT_NULL) { -+ pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx); -+ if (pobj == NULL) -+ return -ESRCH; -+ bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1); -+ } else { -+ bc = get_exec_ub(); -+ while (bc->parent) -+ bc = bc->parent; -+ get_beancounter(bc); -+ } -+ if (bc == NULL) -+ return -ENOMEM; -+ obj->o_obj = bc; -+ -+ for (i = 0; i < UB_RESOURCES; i++) -+ restore_one_bc_parm(v->cpt_parms, bc->ub_parms, 0); -+ for (i = 0; i < UB_RESOURCES; i++) -+ restore_one_bc_parm(v->cpt_parms + UB_RESOURCES * 6, -+ bc->ub_store, 1); -+ return 0; -+} -+ -+int rst_undump_ubc(struct cpt_context *ctx) -+{ -+ loff_t start, end; -+ struct cpt_beancounter_image *v; -+ cpt_object_t *obj; -+ int err; -+ -+ err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end); -+ if (err) -+ return err; -+ -+ while (start < end) { -+ v = cpt_get_buf(ctx); -+ err = rst_get_object(CPT_OBJ_UBC, start, v, ctx); -+ if (err) { -+ cpt_release_buf(ctx); -+ return err; -+ } -+ -+ obj = alloc_cpt_object(GFP_KERNEL, ctx); -+ cpt_obj_setpos(obj, start, ctx); -+ intern_cpt_object(CPT_OBJ_UBC, obj, ctx); -+ -+ restore_one_bc(v, obj, ctx); -+ -+ cpt_release_buf(ctx); -+ start += v->cpt_next; -+ } -+ return 0; -+} -+ -+void rst_finish_ubc(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ -+ for_each_object(obj, CPT_OBJ_UBC) -+ put_beancounter(obj->o_obj); -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_undump.c linux-2.6.16-026test009/kernel/cpt/rst_undump.c ---- linux-2.6.16.orig/kernel/cpt/rst_undump.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_undump.c 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,824 @@ 
-+/* -+ * -+ * kernel/cpt/rst_undump.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/version.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/slab.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/errno.h> -+#include <linux/pagemap.h> -+#include <linux/namespace.h> -+#include <linux/personality.h> -+#include <linux/binfmts.h> -+#include <linux/smp_lock.h> -+#include <linux/ve_proto.h> -+#include <linux/virtinfo.h> -+#include <linux/compat.h> -+#include <linux/vzcalluser.h> -+#include <ub/beancounter.h> -+#include <asm/desc.h> -+#include <asm/unistd.h> -+ -+#include "cpt_obj.h" -+#include "cpt_context.h" -+#include "cpt_files.h" -+#include "cpt_mm.h" -+#include "cpt_process.h" -+#include "cpt_socket.h" -+#include "cpt_net.h" -+#include "cpt_ubc.h" -+#include "cpt_kernel.h" -+ -+static int rst_utsname(cpt_context_t *ctx); -+ -+ -+struct thr_context { -+ struct completion init_complete; -+ struct completion task_done; -+ int error; -+ struct cpt_context *ctx; -+ cpt_object_t *tobj; -+}; -+ -+static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx); -+ -+void __put_namespace(struct namespace *namespace) -+{ -+ eprintk("orphan namespace is lost\n"); -+} -+ -+static int vps_rst_veinfo(struct cpt_context *ctx) -+{ -+ int err; -+ struct cpt_veinfo_image *i; -+ struct ve_struct *ve; -+ struct timespec delta; -+ loff_t start, end; -+ -+ err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end); -+ if (err) -+ goto out; -+ -+ i = cpt_get_buf(ctx); -+ err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx); -+ if (err) -+ goto out_rel; -+ -+ ve = get_exec_env(); -+ ve->_shm_ctlall = i->shm_ctl_all; -+ ve->_shm_ctlmax = i->shm_ctl_max; -+ ve->_shm_ctlmni = i->shm_ctl_mni; -+ -+ ve->_msg_ctlmax = i->msg_ctl_max; -+ 
ve->_msg_ctlmni = i->msg_ctl_mni; -+ ve->_msg_ctlmnb = i->msg_ctl_mnb; -+ -+ BUG_ON(sizeof(ve->_sem_ctls) != sizeof(i->sem_ctl_arr)); -+ ve->_sem_ctls[0] = i->sem_ctl_arr[0]; -+ ve->_sem_ctls[1] = i->sem_ctl_arr[1]; -+ ve->_sem_ctls[2] = i->sem_ctl_arr[2]; -+ ve->_sem_ctls[3] = i->sem_ctl_arr[3]; -+ -+ cpt_timespec_import(&delta, i->start_timespec_delta); -+ _set_normalized_timespec(&ve->start_timespec, -+ ve->start_timespec.tv_sec - delta.tv_sec, -+ ve->start_timespec.tv_nsec - delta.tv_nsec); -+ ve->start_jiffies -= i->start_jiffies_delta; -+ // // FIXME: what??? -+ // // ve->start_cycles -= i->start_jiffies_delta * cycles_per_jiffy; -+ -+ err = 0; -+out_rel: -+ cpt_release_buf(ctx); -+out: -+ return err; -+} -+ -+static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ int err; -+ struct env_create_param2 param; -+ -+ ctx->cpt_jiffies64 = get_jiffies_64(); -+ do_gettimespec(&ctx->delta_time); -+ -+ ctx->delta_time.tv_sec -= ctx->start_time.tv_sec; -+ if (ctx->start_time.tv_nsec > ctx->delta_time.tv_nsec) { -+ ctx->delta_time.tv_sec--; -+ ctx->delta_time.tv_nsec = 1000000000 - (ctx->start_time.tv_nsec - ctx->delta_time.tv_nsec); -+ } else { -+ ctx->delta_time.tv_nsec -= ctx->start_time.tv_nsec; -+ } -+ -+ memset(¶m, 0, sizeof(param)); -+ param.iptables_mask = ctx->iptables_mask; -+ -+ err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2, ¶m, sizeof(param)); -+ if (err < 0) -+ eprintk_ctx("real_env_create: %d\n", err); -+ get_exec_env()->jiffies_fixup = ((ctx->delta_time.tv_sec < 0) ? -+ 0 : timespec_to_jiffies(&ctx->delta_time)) - -+ (unsigned long)(ctx->cpt_jiffies64 - ctx->virt_jiffies64); -+ return err < 0 ? 
err : 0; -+} -+ -+ -+static int hook(void *arg) -+{ -+ struct thr_context *thr_ctx = arg; -+ struct cpt_context *ctx; -+ cpt_object_t *tobj; -+ struct cpt_task_image *ti; -+ int err = 0; -+ -+ current->state = TASK_UNINTERRUPTIBLE; -+ complete(&thr_ctx->init_complete); -+ schedule(); -+ -+ ctx = thr_ctx->ctx; -+ tobj = thr_ctx->tobj; -+ ti = tobj->o_image; -+ -+ current->fs->umask = 0; -+ -+ if (ti->cpt_pid == 1) { -+ err = vps_rst_reparent_root(tobj, ctx); -+ -+ if (err) { -+ rst_report_error(err, ctx); -+ goto out; -+ } -+ -+ memcpy(&get_exec_env()->cap_default, &ti->cpt_ecap, sizeof(kernel_cap_t)); -+ -+ if (ctx->statusfile) { -+ fput(ctx->statusfile); -+ ctx->statusfile = NULL; -+ } -+ -+ if (ctx->lockfile) { -+ mm_segment_t oldfs; -+ ssize_t err = -EINVAL; -+ char b; -+ -+ oldfs = get_fs(); set_fs(KERNEL_DS); -+ if (ctx->lockfile->f_op && ctx->lockfile->f_op->read) -+ err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos); -+ set_fs(oldfs); -+ fput(ctx->lockfile); -+ ctx->lockfile = NULL; -+ } -+ -+ err = vps_rst_veinfo(ctx); -+ if (err) { -+ eprintk_ctx("rst_veinfo: %d\n", err); -+ goto out; -+ } -+ -+ err = rst_utsname(ctx); -+ if (err) { -+ eprintk_ctx("rst_utsname: %d\n", err); -+ goto out; -+ } -+ -+ err = rst_root_namespace(ctx); -+ if (err) { -+ eprintk_ctx("rst_namespace: %d\n", err); -+ goto out; -+ } -+ -+ if ((err = rst_restore_net(ctx)) != 0) { -+ eprintk_ctx("rst_restore_net: %d\n", err); -+ goto out; -+ } -+ -+ err = rst_sockets(ctx); -+ if (err) { -+ eprintk_ctx("rst_sockets: %d\n", err); -+ goto out; -+ } -+ err = rst_sysv_ipc(ctx); -+ if (err) { -+ eprintk_ctx("rst_sysv_ipc: %d\n", err); -+ goto out; -+ } -+ } -+ -+ do { -+ if (current->user->uid != ti->cpt_user) { -+ struct user_struct *u = alloc_uid(ti->cpt_user); -+ if (!u) { -+ eprintk_ctx("alloc_user\n"); -+ } else { -+ switch_uid(u); -+ } -+ } -+ } while (0); -+ -+ if ((err = rst_mm_complete(ti, ctx)) != 0) { -+ eprintk_ctx("rst_mm: %d\n", err); -+ goto out; -+ } -+ 
-+ if ((err = rst_files_complete(ti, ctx)) != 0) { -+ eprintk_ctx("rst_files: %d\n", err); -+ goto out; -+ } -+ -+ if ((err = rst_fs_complete(ti, ctx)) != 0) { -+ eprintk_ctx("rst_fs: %d\n", err); -+ goto out; -+ } -+ -+ if ((err = rst_semundo_complete(ti, ctx)) != 0) { -+ eprintk_ctx("rst_semundo: %d\n", err); -+ goto out; -+ } -+ -+ if ((err = rst_signal_complete(ti, ctx)) != 0) { -+ eprintk_ctx("rst_signal: %d\n", err); -+ goto out; -+ } -+ -+ if (ti->cpt_namespace == CPT_NULL) -+ exit_namespace(current); -+ -+ if (ti->cpt_personality != 0) -+ __set_personality(ti->cpt_personality); -+ -+ current->set_child_tid = NULL; -+ current->clear_child_tid = NULL; -+ current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV); -+ current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV); -+ current->exit_code = ti->cpt_exit_code; -+ current->pdeath_signal = ti->cpt_pdeath_signal; -+ -+ if (ti->cpt_restart.fn != CPT_RBL_0) { -+ if (ti->cpt_restart.fn != CPT_RBL_NANOSLEEP -+ && ti->cpt_restart.fn != CPT_RBL_COMPAT_NANOSLEEP -+ ) { -+ eprintk_ctx("unknown restart block\n"); -+ } else { -+ current->thread_info->restart_block.fn = nanosleep_restart; -+#ifdef CONFIG_X86_64 -+ if (!ti->cpt_64bit) -+ current->thread_info->restart_block.fn = compat_nanosleep_restart; -+#endif -+ if (ctx->image_version != 0) { -+ current->thread_info->restart_block.arg0 = ti->cpt_restart.arg0; -+ current->thread_info->restart_block.arg1 = ti->cpt_restart.arg1; -+ current->thread_info->restart_block.arg2 = ti->cpt_restart.arg2; -+ current->thread_info->restart_block.arg3 = ti->cpt_restart.arg3; -+ if (debug_level > 2) { -+ ktime_t e, e1; -+ struct timespec now; -+ -+ do_posix_clock_monotonic_gettime(&now); -+ e = timespec_to_ktime(now); -+ e1.tv64 = ((u64)current->thread_info->restart_block.arg1 << 32) | (u64) current->thread_info->restart_block.arg0; -+ e = ktime_sub(e1, e); -+ dprintk("rst " CPT_FID " RBL %ld/%ld %Ld\n", CPT_TID(current), -+ current->thread_info->restart_block.arg1, -+ 
current->thread_info->restart_block.arg0, e.tv64); -+ } -+ } else { -+ struct timespec now; -+ ktime_t expire; -+ unsigned long val = ti->cpt_restart.arg0 - -+ timespec_to_jiffies(&ctx->delta_time); -+ if ((long)val <= 0) -+ val = 1; -+ do_posix_clock_monotonic_gettime(&now); -+ expire = ktime_add_ns(timespec_to_ktime(now), (u64)val*TICK_NSEC); -+ current->thread_info->restart_block.arg0 = expire.tv64 & 0xFFFFFFFF; -+ current->thread_info->restart_block.arg1 = expire.tv64 >> 32; -+ current->thread_info->restart_block.arg2 = ti->cpt_restart.arg1; -+ current->thread_info->restart_block.arg3 = CLOCK_MONOTONIC; -+ } -+ } -+ } -+ -+ if (thread_group_leader(current)) { -+ current->signal->it_real_incr.tv64 = 0; -+ if (ctx->image_version != 0) { -+ ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr); -+ } else { -+ ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC); -+ } -+ current->signal->it_prof_incr = ti->cpt_it_prof_incr; -+ current->signal->it_virt_incr = ti->cpt_it_virt_incr; -+ current->signal->it_prof_expires = ti->cpt_it_prof_value; -+ current->signal->it_virt_expires = ti->cpt_it_virt_value; -+ } -+ -+ err = rst_clone_children(tobj, ctx); -+ if (err) { -+ eprintk_ctx("rst_clone_children\n"); -+ goto out; -+ } -+ -+ if (ti->cpt_pid == 1) { -+ if ((err = rst_process_linkage(ctx)) != 0) { -+ eprintk_ctx("rst_process_linkage: %d\n", err); -+ goto out; -+ } -+ if ((err = rst_do_filejobs(ctx)) != 0) { -+ eprintk_ctx("rst_do_filejobs: %d\n", err); -+ goto out; -+ } -+ if ((err = rst_eventpoll(ctx)) != 0) { -+ eprintk_ctx("rst_eventpoll: %d\n", err); -+ goto out; -+ } -+ if ((err = rst_sockets_complete(ctx)) != 0) { -+ eprintk_ctx("rst_sockets_complete: %d\n", err); -+ goto out; -+ } -+ if ((err = rst_stray_files(ctx)) != 0) { -+ eprintk_ctx("rst_stray_files: %d\n", err); -+ goto out; -+ } -+ if ((err = rst_posix_locks(ctx)) != 0) { -+ eprintk_ctx("rst_posix_locks: %d\n", err); -+ goto out; -+ } -+ if ((err = 
rst_tty_jobcontrol(ctx)) != 0) { -+ eprintk_ctx("rst_tty_jobcontrol: %d\n", err); -+ goto out; -+ } -+ if ((err = rst_restore_fs(ctx)) != 0) { -+ eprintk_ctx("rst_restore_fs: %d\n", err); -+ goto out; -+ } -+ } -+ -+out: -+ thr_ctx->error = err; -+ lock_kernel(); -+ complete(&thr_ctx->task_done); -+ -+ if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { -+ preempt_disable(); -+ current->exit_state = EXIT_ZOMBIE; -+ write_lock_irq(&tasklist_lock); -+ nr_zombie++; -+ write_unlock_irq(&tasklist_lock); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) -+ atomic_dec(¤t->signal->live); -+#endif -+ current->flags |= PF_DEAD; -+ if (!(ti->cpt_flags&PF_DEAD)) -+ wprintk_ctx("zombie %d,%d(%s) is not pf_dead\n", current->pid, virt_pid(current), current->comm); -+ module_put(current->thread_info->exec_domain->module); -+ if (current->binfmt) -+ module_put(current->binfmt->module); -+ } else { -+ __set_current_state(TASK_UNINTERRUPTIBLE); -+ } -+ -+ schedule(); -+ -+ dprintk_ctx("leaked through %d/%d %p\n", current->pid, virt_pid(current), current->mm); -+ -+ module_put(THIS_MODULE); -+ complete_and_exit(NULL, 0); -+ return 0; -+} -+ -+#if 0 -+static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx) -+{ -+ struct task_beancounter *tbc; -+ -+ tbc = task_bc(current); -+ -+ put_beancounter(tbc->fork_sub); -+ tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx); -+ if (ti->cpt_mm_ub != CPT_NULL) { -+ put_beancounter(tbc->exec_ub); -+ tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx); -+ } -+} -+#endif -+ -+static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx, -+ struct thr_context *thr_ctx) -+{ -+ task_t *tsk; -+ int pid; -+ -+ thr_ctx->ctx = ctx; -+ thr_ctx->error = 0; -+ init_completion(&thr_ctx->init_complete); -+ init_completion(&thr_ctx->task_done); -+#if 0 -+ set_task_ubs(obj->o_image, ctx); -+#endif -+ -+ pid = local_kernel_thread(hook, thr_ctx, 0, 0); -+ if (pid < 0) -+ return pid; -+ read_lock(&tasklist_lock); -+ tsk = 
find_task_by_pid_ve(pid); -+ if (tsk) -+ get_task_struct(tsk); -+ read_unlock(&tasklist_lock); -+ if (tsk == NULL) -+ return -ESRCH; -+ cpt_obj_setobj(obj, tsk, ctx); -+ thr_ctx->tobj = obj; -+ return 0; -+} -+ -+static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ task_t *tsk = obj->o_obj; -+ struct cpt_task_image *ti = obj->o_image; -+ -+ memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm)); -+ rst_mm_basic(obj, ti, ctx); -+ return 0; -+} -+ -+static int make_baby(cpt_object_t *cobj, -+ struct cpt_task_image *pi, -+ struct cpt_context *ctx) -+{ -+ unsigned long flags; -+ struct cpt_task_image *ci = cobj->o_image; -+ struct thr_context thr_ctx; -+ task_t *tsk; -+ pid_t pid; -+ -+ flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx) -+ | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx); -+ if (ci->cpt_rppid != pi->cpt_pid) { -+ flags |= CLONE_THREAD|CLONE_PARENT; -+ if (ci->cpt_signal != pi->cpt_signal || -+ !(flags&CLONE_SIGHAND) || -+ (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) { -+ eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n", -+ (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid, -+ ci->cpt_signal, pi->cpt_signal, flags -+ ); -+ return -EINVAL; -+ } -+ } -+ -+ thr_ctx.ctx = ctx; -+ thr_ctx.error = 0; -+ init_completion(&thr_ctx.init_complete); -+ init_completion(&thr_ctx.task_done); -+ thr_ctx.tobj = cobj; -+ -+#if 0 -+ set_task_ubs(ci, ctx); -+#endif -+ -+ pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid); -+ if (pid < 0) -+ return pid; -+ -+ read_lock(&tasklist_lock); -+ tsk = find_task_by_pid_ve(pid); -+ if (tsk) -+ get_task_struct(tsk); -+ read_unlock(&tasklist_lock); -+ if (tsk == NULL) -+ return -ESRCH; -+ cpt_obj_setobj(cobj, tsk, ctx); -+ thr_ctx.tobj = cobj; -+ wait_for_completion(&thr_ctx.init_complete); -+#ifdef CONFIG_SMP -+ wait_task_inactive(cobj->o_obj); -+#endif -+ rst_basic_init_task(cobj, ctx); -+ -+ /* clone() increases group_stop_count if it was not zero and -+ * 
CLONE_THREAD was asked. Undo. -+ */ -+ if (current->signal->group_stop_count && (flags & CLONE_THREAD)) { -+ if (tsk->signal != current->signal) BUG(); -+ current->signal->group_stop_count--; -+ } -+ -+ wake_up_process(tsk); -+ wait_for_completion(&thr_ctx.task_done); -+ wait_task_inactive(tsk); -+ -+ return thr_ctx.error; -+} -+ -+static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx) -+{ -+ int err = 0; -+ struct cpt_task_image *ti = obj->o_image; -+ cpt_object_t *cobj; -+ -+ for_each_object(cobj, CPT_OBJ_TASK) { -+ struct cpt_task_image *ci = cobj->o_image; -+ if (cobj == obj) -+ continue; -+ if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) || -+ (ci->cpt_leader == ti->cpt_pid && -+ ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) { -+ err = make_baby(cobj, ti, ctx); -+ if (err) { -+ eprintk_ctx("make_baby: %d\n", err); -+ return err; -+ } -+ } -+ } -+ return 0; -+} -+ -+static int read_task_images(struct cpt_context *ctx) -+{ -+ int err; -+ loff_t start, end; -+ -+ err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end); -+ if (err) -+ return err; -+ -+ while (start < end) { -+ cpt_object_t *obj; -+ struct cpt_task_image *ti = cpt_get_buf(ctx); -+ -+ err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx); -+ if (err) { -+ cpt_release_buf(ctx); -+ return err; -+ } -+ if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) { -+ eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid); -+ cpt_release_buf(ctx); -+ return -EINVAL; -+ } -+ obj = alloc_cpt_object(GFP_KERNEL, ctx); -+ cpt_obj_setpos(obj, start, ctx); -+ intern_cpt_object(CPT_OBJ_TASK, obj, ctx); -+ obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL); -+ if (obj->o_image == NULL) { -+ cpt_release_buf(ctx); -+ return -ENOMEM; -+ } -+ memcpy(obj->o_image, ti, sizeof(*ti)); -+ err = ctx->pread(obj->o_image + sizeof(*ti), -+ ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti)); -+ cpt_release_buf(ctx); -+ if (err) -+ return err; -+ start += ti->cpt_next; -+ } -+ return 0; -+} 
-+ -+ -+static int vps_rst_restore_tree(struct cpt_context *ctx) -+{ -+ int err; -+ cpt_object_t *obj; -+ struct thr_context thr_ctx_root; -+ -+ err = read_task_images(ctx); -+ if (err) -+ return err; -+ -+ err = rst_undump_ubc(ctx); -+ if (err) -+ return err; -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ err = create_root_task(obj, ctx, &thr_ctx_root); -+ if (err) -+ return err; -+ -+ wait_for_completion(&thr_ctx_root.init_complete); -+#ifdef CONFIG_SMP -+ wait_task_inactive(obj->o_obj); -+#endif -+ rst_basic_init_task(obj, ctx); -+ -+ wake_up_process(obj->o_obj); -+ wait_for_completion(&thr_ctx_root.task_done); -+ wait_task_inactive(obj->o_obj); -+ err = thr_ctx_root.error; -+ if (err) -+ return err; -+ break; -+ } -+ -+ return err; -+} -+ -+ -+int vps_rst_undump(struct cpt_context *ctx) -+{ -+ int err; -+ unsigned long umask; -+ -+ err = rst_open_dumpfile(ctx); -+ if (err) -+ return err; -+ -+#ifndef CONFIG_X86_64 -+ if (ctx->tasks64) { -+ eprintk_ctx("Cannot restore 64 bit VE on this architecture\n"); -+ return -EINVAL; -+ } -+#endif -+ -+ umask = current->fs->umask; -+ current->fs->umask = 0; -+ -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ err = rst_setup_pagein(ctx); -+#endif -+ -+ if (err == 0) -+ err = vps_rst_restore_tree(ctx); -+ -+ if (err == 0) -+ err = rst_restore_process(ctx); -+ -+ current->fs->umask = umask; -+ -+ return err; -+} -+ -+static int rst_unlock_ve(struct cpt_context *ctx) -+{ -+ struct ve_struct *env; -+ -+ env = get_ve_by_id(ctx->ve_id); -+ if (!env) -+ return -ESRCH; -+ down_write(&env->op_sem); -+ env->is_locked = 0; -+ up_write(&env->op_sem); -+ put_ve(env); -+ return 0; -+} -+ -+int rst_resume(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ int err = 0; -+ -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file = obj->o_obj; -+ -+ fput(file); -+ } -+ -+ rst_resume_network(ctx); -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ struct cpt_task_image *ti = obj->o_image; -+ -+ if (!tsk) -+ continue; -+ -+ 
if (ti->cpt_state == TASK_UNINTERRUPTIBLE) { -+ dprintk_ctx("task %d/%d(%s) is started\n", virt_pid(tsk), tsk->pid, tsk->comm); -+ -+ /* Weird... If a signal is sent to stopped task, -+ * nobody makes recalc_sigpending(). We have to do -+ * this by hands after wake_up_process(). -+ * if we did this before a signal could arrive before -+ * wake_up_process() and stall. -+ */ -+ spin_lock_irq(&tsk->sighand->siglock); -+ if (!signal_pending(tsk)) -+ recalc_sigpending_tsk(tsk); -+ spin_unlock_irq(&tsk->sighand->siglock); -+ -+ wake_up_process(tsk); -+ } else { -+ if (ti->cpt_state == TASK_STOPPED || -+ ti->cpt_state == TASK_TRACED) { -+ set_task_state(tsk, ti->cpt_state); -+ } -+ } -+ put_task_struct(tsk); -+ } -+ -+ rst_unlock_ve(ctx); -+ -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ rst_complete_pagein(ctx, 0); -+#endif -+ -+ rst_finish_ubc(ctx); -+ cpt_object_destroy(ctx); -+ -+ return err; -+} -+ -+int rst_kill(struct cpt_context *ctx) -+{ -+ cpt_object_t *obj; -+ int err = 0; -+ -+ for_each_object(obj, CPT_OBJ_FILE) { -+ struct file *file = obj->o_obj; -+ -+ fput(file); -+ } -+ -+ for_each_object(obj, CPT_OBJ_TASK) { -+ task_t *tsk = obj->o_obj; -+ -+ if (tsk == NULL) -+ continue; -+ -+ if (tsk->exit_state == 0) { -+ send_sig(SIGKILL, tsk, 1); -+ -+ spin_lock_irq(&tsk->sighand->siglock); -+ sigfillset(&tsk->blocked); -+ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); -+ set_tsk_thread_flag(tsk, TIF_SIGPENDING); -+ clear_tsk_thread_flag(tsk, TIF_FREEZE); -+ if (tsk->flags & PF_FROZEN) -+ tsk->flags &= ~PF_FROZEN; -+ spin_unlock_irq(&tsk->sighand->siglock); -+ -+ wake_up_process(tsk); -+ } -+ -+ put_task_struct(tsk); -+ } -+ -+#ifdef CONFIG_VZ_CHECKPOINT_LAZY -+ rst_complete_pagein(ctx, 1); -+#endif -+ -+ rst_finish_ubc(ctx); -+ cpt_object_destroy(ctx); -+ -+ return err; -+} -+ -+static int rst_utsname(cpt_context_t *ctx) -+{ -+ int err; -+ loff_t sec = ctx->sections[CPT_SECT_UTSNAME]; -+ loff_t endsec; -+ struct cpt_section_hdr h; -+ struct cpt_object_hdr o; -+ int i; -+ -+ 
if (sec == CPT_NULL) -+ return 0; -+ -+ err = ctx->pread(&h, sizeof(h), ctx, sec); -+ if (err) -+ return err; -+ if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h)) -+ return -EINVAL; -+ -+ i = 0; -+ endsec = sec + h.cpt_next; -+ sec += h.cpt_hdrlen; -+ while (sec < endsec) { -+ int len; -+ char *ptr; -+ err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx); -+ if (err) -+ return err; -+ len = o.cpt_next - o.cpt_hdrlen; -+ if (len > __NEW_UTS_LEN+1) -+ return -ENAMETOOLONG; -+ switch (i) { -+ case 0: -+ ptr = ve_utsname.nodename; break; -+ case 1: -+ ptr = ve_utsname.domainname; break; -+ default: -+ return -EINVAL; -+ } -+ err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen); -+ if (err) -+ return err; -+ i++; -+ sec += o.cpt_next; -+ } -+ -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/cpt/rst_x8664.S linux-2.6.16-026test009/kernel/cpt/rst_x8664.S ---- linux-2.6.16.orig/kernel/cpt/rst_x8664.S 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpt/rst_x8664.S 2006-04-19 15:02:13.000000000 +0400 -@@ -0,0 +1,61 @@ -+#define ASSEMBLY 1 -+ -+#include <linux/config.h> -+#include <linux/linkage.h> -+#include <asm/segment.h> -+#include <asm/smp.h> -+#include <asm/cache.h> -+#include <asm/errno.h> -+#include <asm/dwarf2.h> -+#include <asm/calling.h> -+#include <asm/msr.h> -+#include <asm/unistd.h> -+#include <asm/thread_info.h> -+#include <asm/hw_irq.h> -+#include <asm/errno.h> -+#include <asm/asm-offsets.h> -+ -+ .code64 -+ .global schedule_tail_hook, schedule_tail_p -+ .align 8 -+schedule_tail_hook: -+ movq schedule_tail_p(%rip),%r11 -+ call *%r11 -+ GET_THREAD_INFO(%rcx) -+ btr $22,threadinfo_flags(%rcx) /* TIF_RESUME */ -+ jc 1f -+ retq -+ -+ /* If TIF_RESUME is set, (%rsp) is pointer to hook function -+ * the hook will do the work and jump to the next hook, -+ * everything should end at ret_from_fork+5. 
-+ */ -+1: addq $8,%rsp -+ retq -+ -+ .align 8 -+ .global ret_from_fork2 -+ret_from_fork2: -+ cmpq $0,ORIG_RAX(%rsp) -+ jge ret_from_fork+5 -+ RESTORE_REST -+ jmp int_ret_from_sys_call -+ -+ .align 8 -+ .global ret_last_siginfo -+ret_last_siginfo: -+ call rlsi -+ movq %rax,%rsp -+ retq -+ -+ .align 8 -+ .global ret_child_tid -+ret_child_tid: -+ movq %rsp,%rdi -+ call rct -+ movq %rax,%rsp -+ retq -+ -+ .data -+schedule_tail_p: -+ .quad 0 -diff -upr linux-2.6.16.orig/kernel/cpu.c linux-2.6.16-026test009/kernel/cpu.c ---- linux-2.6.16.orig/kernel/cpu.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpu.c 2006-04-19 15:02:12.000000000 +0400 -@@ -95,7 +95,7 @@ static inline void check_for_tasks(int c - struct task_struct *p; - - write_lock_irq(&tasklist_lock); -- for_each_process(p) { -+ for_each_process_all(p) { - if (task_cpu(p) == cpu && - (!cputime_eq(p->utime, cputime_zero) || - !cputime_eq(p->stime, cputime_zero))) -diff -upr linux-2.6.16.orig/kernel/cpuset.c linux-2.6.16-026test009/kernel/cpuset.c ---- linux-2.6.16.orig/kernel/cpuset.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/cpuset.c 2006-04-19 15:02:12.000000000 +0400 -@@ -897,7 +897,7 @@ static int update_nodemask(struct cpuset - n = 0; - - /* Load up mmarray[] with mm reference for each task in cpuset. 
*/ -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - struct mm_struct *mm; - - if (n >= ntasks) { -@@ -911,7 +911,7 @@ static int update_nodemask(struct cpuset - if (!mm) - continue; - mmarray[n++] = mm; -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - write_unlock_irq(&tasklist_lock); - - /* -@@ -1125,7 +1125,7 @@ static int attach_task(struct cpuset *cs - if (pid) { - read_lock(&tasklist_lock); - -- tsk = find_task_by_pid(pid); -+ tsk = find_task_by_pid_all(pid); - if (!tsk || tsk->flags & PF_EXITING) { - read_unlock(&tasklist_lock); - return -ESRCH; -@@ -1561,13 +1561,13 @@ static int pid_array_load(pid_t *pidarra - - read_lock(&tasklist_lock); - -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - if (p->cpuset == cs) { - pidarray[n++] = p->pid; - if (unlikely(n == npids)) - goto array_full; - } -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - - array_full: - read_unlock(&tasklist_lock); -diff -upr linux-2.6.16.orig/kernel/exec_domain.c linux-2.6.16-026test009/kernel/exec_domain.c ---- linux-2.6.16.orig/kernel/exec_domain.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/exec_domain.c 2006-04-19 15:02:11.000000000 +0400 -@@ -140,6 +140,7 @@ __set_personality(u_long personality) - ep = lookup_exec_domain(personality); - if (ep == current_thread_info()->exec_domain) { - current->personality = personality; -+ module_put(ep->module); - return 0; - } - -diff -upr linux-2.6.16.orig/kernel/exit.c linux-2.6.16-026test009/kernel/exit.c ---- linux-2.6.16.orig/kernel/exit.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/exit.c 2006-04-19 15:02:12.000000000 +0400 -@@ -42,7 +42,7 @@ extern struct task_struct *child_reaper; - - int getrusage(struct task_struct *, int, struct rusage __user *); - --static void exit_mm(struct task_struct * tsk); -+void exit_mm(struct task_struct * tsk); - - static void __unhash_process(struct task_struct *p) - { -@@ -57,18 +57,19 @@ static void 
__unhash_process(struct task - } - - REMOVE_LINKS(p); -+ REMOVE_VE_LINKS(p); - } - - void release_task(struct task_struct * p) - { - int zap_leader; - task_t *leader; -- struct dentry *proc_dentry; -+ struct dentry *proc_dentry[2]; - - repeat: - atomic_dec(&p->user->processes); - spin_lock(&p->proc_lock); -- proc_dentry = proc_pid_unhash(p); -+ proc_pid_unhash(p, proc_dentry); - write_lock_irq(&tasklist_lock); - if (unlikely(p->ptrace)) - __ptrace_unlink(p); -@@ -80,6 +81,8 @@ repeat: - * the process by __unhash_process. - */ - __unhash_process(p); -+ nr_zombie--; -+ atomic_inc(&nr_dead); - - /* - * If we are the last non-leader member of the thread -@@ -107,6 +110,10 @@ repeat: - spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); - release_thread(p); -+#ifdef CONFIG_VE -+ if (atomic_dec_and_test(&VE_TASK_INFO(p)->owner_env->pcounter)) -+ do_env_cleanup(VE_TASK_INFO(p)->owner_env); -+#endif - put_task_struct(p); - - p = leader; -@@ -118,10 +125,10 @@ repeat: - - void unhash_process(struct task_struct *p) - { -- struct dentry *proc_dentry; -+ struct dentry *proc_dentry[2]; - - spin_lock(&p->proc_lock); -- proc_dentry = proc_pid_unhash(p); -+ proc_pid_unhash(p, proc_dentry); - write_lock_irq(&tasklist_lock); - __unhash_process(p); - write_unlock_irq(&tasklist_lock); -@@ -139,14 +146,16 @@ int session_of_pgrp(int pgrp) - struct task_struct *p; - int sid = -1; - -+ WARN_ON(is_virtual_pid(pgrp)); -+ - read_lock(&tasklist_lock); -- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { -+ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { - if (p->signal->session > 0) { - sid = p->signal->session; - goto out; - } -- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); -- p = find_task_by_pid(pgrp); -+ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); -+ p = find_task_by_pid_ve(pgrp); - if (p) - sid = p->signal->session; - out: -@@ -168,17 +177,19 @@ static int will_become_orphaned_pgrp(int - struct task_struct *p; - int ret = 1; - -- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { -+ 
WARN_ON(is_virtual_pid(pgrp)); -+ -+ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { - if (p == ignored_task - || p->exit_state -- || p->real_parent->pid == 1) -+ || virt_pid(p->real_parent) == 1) - continue; - if (process_group(p->real_parent) != pgrp - && p->real_parent->signal->session == p->signal->session) { - ret = 0; - break; - } -- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); -+ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); - return ret; /* (sighing) "Often!" */ - } - -@@ -186,6 +197,8 @@ int is_orphaned_pgrp(int pgrp) - { - int retval; - -+ WARN_ON(is_virtual_pid(pgrp)); -+ - read_lock(&tasklist_lock); - retval = will_become_orphaned_pgrp(pgrp, NULL); - read_unlock(&tasklist_lock); -@@ -198,7 +211,7 @@ static int has_stopped_jobs(int pgrp) - int retval = 0; - struct task_struct *p; - -- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { -+ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { - if (p->state != TASK_STOPPED) - continue; - -@@ -214,7 +227,7 @@ static int has_stopped_jobs(int pgrp) - - retval = 1; - break; -- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); -+ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); - return retval; - } - -@@ -263,6 +276,9 @@ void __set_special_pids(pid_t session, p - { - struct task_struct *curr = current->group_leader; - -+ WARN_ON(is_virtual_pid(pgrp)); -+ WARN_ON(is_virtual_pid(session)); -+ - if (curr->signal->session != session) { - detach_pid(curr, PIDTYPE_SID); - curr->signal->session = session; -@@ -281,6 +297,7 @@ void set_special_pids(pid_t session, pid - __set_special_pids(session, pgrp); - write_unlock_irq(&tasklist_lock); - } -+EXPORT_SYMBOL(set_special_pids); - - /* - * Let kernel threads use this to say that they -@@ -500,7 +517,7 @@ EXPORT_SYMBOL_GPL(exit_fs); - * Turn us into a lazy TLB process if we - * aren't already.. 
- */ --static void exit_mm(struct task_struct * tsk) -+void exit_mm(struct task_struct * tsk) - { - struct mm_struct *mm = tsk->mm; - -@@ -535,6 +552,7 @@ static void exit_mm(struct task_struct * - task_unlock(tsk); - mmput(mm); - } -+EXPORT_SYMBOL_GPL(exit_mm); - - static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) - { -@@ -613,13 +631,12 @@ static void reparent_thread(task_t *p, t - static void forget_original_parent(struct task_struct * father, - struct list_head *to_release) - { -- struct task_struct *p, *reaper = father; -+ struct task_struct *p, *tsk_reaper, *reaper = father; - struct list_head *_p, *_n; - - do { - reaper = next_thread(reaper); - if (reaper == father) { -- reaper = child_reaper; - break; - } - } while (reaper->exit_state); -@@ -641,9 +658,16 @@ static void forget_original_parent(struc - /* if father isn't the real parent, then ptrace must be enabled */ - BUG_ON(father != p->real_parent && !ptrace); - -+ tsk_reaper = reaper; -+ if (tsk_reaper == father) -+#ifdef CONFIG_VE -+ tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry; -+ if (tsk_reaper == p) -+#endif -+ tsk_reaper = child_reaper; - if (father == p->real_parent) { -- /* reparent with a reaper, real father it's us */ -- choose_new_parent(p, reaper, child_reaper); -+ /* reparent with a tsk_reaper, real father it's us */ -+ choose_new_parent(p, tsk_reaper, child_reaper); - reparent_thread(p, father, 0); - } else { - /* reparent ptraced task to its real parent */ -@@ -664,7 +688,15 @@ static void forget_original_parent(struc - } - list_for_each_safe(_p, _n, &father->ptrace_children) { - p = list_entry(_p,struct task_struct,ptrace_list); -- choose_new_parent(p, reaper, child_reaper); -+ -+ tsk_reaper = reaper; -+ if (tsk_reaper == father) -+#ifdef CONFIG_VE -+ tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry; -+ if (tsk_reaper == p) -+#endif -+ tsk_reaper = child_reaper; -+ choose_new_parent(p, tsk_reaper, child_reaper); - reparent_thread(p, father, 1); 
- } - } -@@ -760,6 +792,9 @@ static void exit_notify(struct task_stru - && !capable(CAP_KILL)) - tsk->exit_signal = SIGCHLD; - -+ if (tsk->exit_signal != -1 && t == child_reaper) -+ /* We dont want people slaying init. */ -+ tsk->exit_signal = SIGCHLD; - - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal -@@ -778,6 +813,7 @@ static void exit_notify(struct task_stru - unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) - state = EXIT_DEAD; - tsk->exit_state = state; -+ nr_zombie++; - - write_unlock_irq(&tasklist_lock); - -@@ -792,6 +828,82 @@ static void exit_notify(struct task_stru - release_task(tsk); - } - -+#ifdef CONFIG_VE -+/* -+ * Handle exitting of init process, it's a special case for VE. -+ */ -+static void do_initproc_exit(void) -+{ -+ struct task_struct *tsk; -+ struct ve_struct *env; -+ struct siginfo info; -+ struct task_struct *g, *p; -+ long delay = 1L; -+ -+ tsk = current; -+ env = VE_TASK_INFO(current)->owner_env; -+ if (env->init_entry != tsk) -+ return; -+ -+ if (ve_is_super(env) && tsk->pid == 1) -+ panic("Attempted to kill init!"); -+ -+ memset(&info, 0, sizeof(info)); -+ info.si_errno = 0; -+ info.si_code = SI_KERNEL; -+ info.si_pid = virt_pid(tsk); -+ info.si_uid = current->uid; -+ info.si_signo = SIGKILL; -+ -+ /* -+ * Here the VE changes its state into "not running". -+ * op_sem taken for write is a barrier to all VE manipulations from -+ * ioctl: it waits for operations currently in progress and blocks all -+ * subsequent operations until is_running is set to 0 and op_sem is -+ * released. 
-+ */ -+ down_write(&env->op_sem); -+ env->is_running = 0; -+ up_write(&env->op_sem); -+ -+ /* send kill to all processes of VE */ -+ read_lock(&tasklist_lock); -+ do_each_thread_ve(g, p) { -+ force_sig_info(SIGKILL, &info, p); -+ } while_each_thread_ve(g, p); -+ read_unlock(&tasklist_lock); -+ -+ /* wait for all init childs exit */ -+ while (atomic_read(&env->pcounter) > 1) { -+ if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0) -+ continue; -+ /* it was ENOCHLD or no more children somehow */ -+ if (atomic_read(&env->pcounter) == 1) -+ break; -+ -+ /* clear all signals to avoid wakeups */ -+ if (signal_pending(tsk)) -+ flush_signals(tsk); -+ /* we have child without signal sent */ -+ __set_current_state(TASK_INTERRUPTIBLE); -+ schedule_timeout(delay); -+ delay = (delay < HZ) ? (delay << 1) : HZ; -+ read_lock(&tasklist_lock); -+ do_each_thread_ve(g, p) { -+ if (p != tsk) -+ force_sig_info(SIGKILL, &info, p); -+ } while_each_thread_ve(g, p); -+ read_unlock(&tasklist_lock); -+ } -+ env->init_entry = child_reaper; -+ write_lock_irq(&tasklist_lock); -+ REMOVE_LINKS(tsk); -+ tsk->parent = tsk->real_parent = child_reaper; -+ SET_LINKS(tsk); -+ write_unlock_irq(&tasklist_lock); -+} -+#endif -+ - fastcall NORET_TYPE void do_exit(long code) - { - struct task_struct *tsk = current; -@@ -805,14 +917,20 @@ fastcall NORET_TYPE void do_exit(long co - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); -+#ifdef CONFIG_VE -+ do_initproc_exit(); -+#else - if (unlikely(tsk->pid == 1)) - panic("Attempted to kill init!"); -+#endif - if (tsk->io_context) - exit_io_context(); - - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; -+ set_pn_state(current, PN_STOP_EXIT); - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); -+ clear_pn_state(current); - } - - /* -@@ -911,7 +1029,14 @@ asmlinkage long sys_exit(int error_code) - - task_t fastcall *next_thread(const task_t *p) - { -- return 
pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); -+ task_t *tsk; -+ -+ tsk = pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); -+#ifdef CONFIG_VE -+ /* all threads should belong to ONE ve! */ -+ BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env); -+#endif -+ return tsk; - } - - EXPORT_SYMBOL(next_thread); -@@ -960,14 +1085,19 @@ asmlinkage void sys_exit_group(int error - static int eligible_child(pid_t pid, int options, task_t *p) - { - if (pid > 0) { -- if (p->pid != pid) -+ if ((is_virtual_pid(pid) ? virt_pid(p) : p->pid) != pid) - return 0; - } else if (!pid) { - if (process_group(p) != process_group(current)) - return 0; - } else if (pid != -1) { -- if (process_group(p) != -pid) -- return 0; -+ if (__is_virtual_pid(-pid)) { -+ if (virt_pgid(p) != -pid) -+ return 0; -+ } else { -+ if (process_group(p) != -pid) -+ return 0; -+ } - } - - /* -@@ -1157,7 +1287,7 @@ static int wait_task_zombie(task_t *p, i - p->exit_state = EXIT_ZOMBIE; - return retval; - } -- retval = p->pid; -+ retval = get_task_pid(p); - if (p->real_parent != p->parent) { - write_lock_irq(&tasklist_lock); - /* Double-check with lock held. 
*/ -@@ -1292,7 +1422,7 @@ bail_ref: - if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); - if (!retval) -- retval = p->pid; -+ retval = get_task_pid(p); - put_task_struct(p); - - BUG_ON(!retval); -@@ -1574,6 +1704,7 @@ asmlinkage long sys_wait4(pid_t pid, int - prevent_tail_call(ret); - return ret; - } -+EXPORT_SYMBOL_GPL(sys_wait4); - - #ifdef __ARCH_WANT_SYS_WAITPID - -diff -upr linux-2.6.16.orig/kernel/fork.c linux-2.6.16-026test009/kernel/fork.c ---- linux-2.6.16.orig/kernel/fork.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/fork.c 2006-04-19 15:02:12.000000000 +0400 -@@ -20,6 +20,7 @@ - #include <linux/vmalloc.h> - #include <linux/completion.h> - #include <linux/namespace.h> -+#include <linux/file.h> - #include <linux/personality.h> - #include <linux/mempolicy.h> - #include <linux/sem.h> -@@ -52,11 +53,15 @@ - #include <asm/cacheflush.h> - #include <asm/tlbflush.h> - -+#include <ub/ub_vmpages.h> -+#include <ub/ub_misc.h> -+ - /* - * Protected counters by write_lock_irq(&tasklist_lock) - */ - unsigned long total_forks; /* Handle normal Linux uptimes. */ - int nr_threads; /* The idle threads do not count.. 
*/ -+EXPORT_SYMBOL(nr_threads); - - int max_threads; /* tunable limit on nr_threads */ - -@@ -103,6 +108,7 @@ static kmem_cache_t *mm_cachep; - - void free_task(struct task_struct *tsk) - { -+ ub_task_uncharge(tsk); - free_thread_info(tsk->thread_info); - free_task_struct(tsk); - } -@@ -122,9 +128,14 @@ void __put_task_struct_cb(struct rcu_hea - free_uid(tsk->user); - put_group_info(tsk->group_info); - -+#ifdef CONFIG_VE -+ put_ve(VE_TASK_INFO(tsk)->owner_env); -+ atomic_dec(&nr_dead); -+#endif - if (!profile_handoff_task(tsk)) - free_task(tsk); - } -+EXPORT_SYMBOL_GPL(__put_task_struct_cb); - - void __init fork_init(unsigned long mempages) - { -@@ -135,7 +146,7 @@ void __init fork_init(unsigned long memp - /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", sizeof(struct task_struct), -- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); -+ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_UBC, NULL, NULL); - #endif - - /* -@@ -166,22 +177,30 @@ static struct task_struct *dup_task_stru - - tsk = alloc_task_struct(); - if (!tsk) -- return NULL; -+ goto out; - - ti = alloc_thread_info(tsk); -- if (!ti) { -- free_task_struct(tsk); -- return NULL; -- } -+ if (!ti) -+ goto out_tsk; - - *tsk = *orig; - tsk->thread_info = ti; - setup_thread_stack(tsk, orig); - -+ if (ub_task_charge(orig, tsk)) -+ goto out_ti; -+ - /* One for us, one for whoever does the "release_task()" (usually parent) */ - atomic_set(&tsk->usage,2); - atomic_set(&tsk->fs_excl, 0); - return tsk; -+ -+out_ti: -+ free_thread_info(ti); -+out_tsk: -+ free_task_struct(tsk); -+out: -+ return NULL; - } - - #ifdef CONFIG_MMU -@@ -219,7 +238,12 @@ static inline int dup_mmap(struct mm_str - -pages); - continue; - } -+ - charge = 0; -+ if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start, -+ mpnt->vm_flags & ~VM_LOCKED, -+ mpnt->vm_file, UB_HARD)) -+ goto fail_noch; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> 
PAGE_SHIFT; - if (security_vm_enough_memory(len)) -@@ -238,6 +262,7 @@ static inline int dup_mmap(struct mm_str - tmp->vm_flags &= ~VM_LOCKED; - tmp->vm_mm = mm; - tmp->vm_next = NULL; -+ set_vma_rss(tmp, 0); - anon_vma_link(tmp); - file = tmp->vm_file; - if (file) { -@@ -266,7 +291,7 @@ static inline int dup_mmap(struct mm_str - rb_parent = &tmp->vm_rb; - - mm->map_count++; -- retval = copy_page_range(mm, oldmm, mpnt); -+ retval = copy_page_range(mm, oldmm, tmp, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); -@@ -283,6 +308,9 @@ out: - fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); - fail_nomem: -+ ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start, -+ mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file); -+fail_noch: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -@@ -313,7 +341,8 @@ static inline void mm_free_pgd(struct mm - - #include <linux/init_task.h> - --static struct mm_struct * mm_init(struct mm_struct * mm) -+static struct mm_struct * mm_init(struct mm_struct * mm, -+ struct task_struct *tsk) - { - atomic_set(&mm->mm_users, 1); - atomic_set(&mm->mm_count, 1); -@@ -328,11 +357,14 @@ static struct mm_struct * mm_init(struct - mm->ioctx_list = NULL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; -+ set_mm_ub(mm, tsk); - - if (likely(!mm_alloc_pgd(mm))) { - mm->def_flags = 0; - return mm; - } -+ -+ put_mm_ub(mm); - free_mm(mm); - return NULL; - } -@@ -347,10 +379,11 @@ struct mm_struct * mm_alloc(void) - mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); -- mm = mm_init(mm); -+ mm = mm_init(mm, NULL); - } - return mm; - } -+EXPORT_SYMBOL_GPL(mm_alloc); - - /* - * Called when the last reference to the mm -@@ -362,8 +395,10 @@ void fastcall __mmdrop(struct mm_struct - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); -+ put_mm_ub(mm); - free_mm(mm); - } -+EXPORT_SYMBOL_GPL(__mmdrop); - - /* - * Decrement the use count and release all resources for an mm. 
-@@ -466,7 +501,7 @@ static struct mm_struct *dup_mm(struct t - - memcpy(mm, oldmm, sizeof(*mm)); - -- if (!mm_init(mm)) -+ if (!mm_init(mm, tsk)) - goto fail_nomem; - - if (init_new_context(tsk, mm)) -@@ -720,7 +755,7 @@ out_release: - free_fdset (new_fdt->open_fds, new_fdt->max_fdset); - free_fd_array(new_fdt->fd, new_fdt->max_fds); - kmem_cache_free(files_cachep, newf); -- goto out; -+ return NULL; - } - - static int copy_files(unsigned long clone_flags, struct task_struct * tsk) -@@ -896,7 +931,7 @@ asmlinkage long sys_set_tid_address(int - { - current->clear_child_tid = tidptr; - -- return current->pid; -+ return virt_pid(current); - } - - /* -@@ -913,7 +948,7 @@ static task_t *copy_process(unsigned lon - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, -- int pid) -+ int pid, long pid0) - { - int retval; - struct task_struct *p = NULL; -@@ -974,12 +1009,20 @@ static task_t *copy_process(unsigned lon - p->did_exec = 0; - copy_flags(clone_flags, p); - p->pid = pid; -+#ifdef CONFIG_VE -+ set_virt_pid(p, alloc_vpid(p->pid, pid0 ? 
: -1)); -+ if (virt_pid(p) < 0) -+ goto bad_fork_cleanup_module; -+#endif - retval = -EFAULT; - if (clone_flags & CLONE_PARENT_SETTID) -- if (put_user(p->pid, parent_tidptr)) -+ if (put_user(virt_pid(p), parent_tidptr)) - goto bad_fork_cleanup; - - p->proc_dentry = NULL; -+#ifdef CONFIG_VE -+ p->ve_task_info.glob_proc_dentry = NULL; -+#endif - - INIT_LIST_HEAD(&p->children); - INIT_LIST_HEAD(&p->sibling); -@@ -1027,8 +1070,13 @@ static task_t *copy_process(unsigned lon - #endif - - p->tgid = p->pid; -- if (clone_flags & CLONE_THREAD) -+ set_virt_tgid(p, virt_pid(p)); -+ set_virt_pgid(p, virt_pgid(current)); -+ set_virt_sid(p, virt_sid(current)); -+ if (clone_flags & CLONE_THREAD) { - p->tgid = current->tgid; -+ set_virt_tgid(p, virt_tgid(current)); -+ } - - if ((retval = security_task_alloc(p))) - goto bad_fork_cleanup_policy; -@@ -1181,6 +1229,12 @@ static task_t *copy_process(unsigned lon - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); - -+#ifdef CONFIG_VE -+ SET_VE_LINKS(p); -+ atomic_inc(&p->ve_task_info.owner_env->pcounter); -+ get_ve(p->ve_task_info.owner_env); -+ seqcount_init(&p->ve_task_info.wakeup_lock); -+#endif - if (thread_group_leader(p)) { - p->signal->tty = current->signal->tty; - p->signal->pgrp = process_group(current); -@@ -1228,6 +1282,11 @@ bad_fork_cleanup_cpuset: - #endif - cpuset_exit(p); - bad_fork_cleanup: -+#ifdef CONFIG_VE -+ if (virt_pid(p) != p->pid && virt_pid(p) > 0) -+ free_vpid(virt_pid(p), get_exec_env()); -+bad_fork_cleanup_module: -+#endif - if (p->binfmt) - module_put(p->binfmt->module); - bad_fork_cleanup_put_domain: -@@ -1253,7 +1312,7 @@ task_t * __devinit fork_idle(int cpu) - task_t *task; - struct pt_regs regs; - -- task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); -+ task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0, 0); - if (!task) - return ERR_PTR(-ENOMEM); - init_idle(task, cpu); -@@ -1283,12 +1342,13 @@ static inline int fork_traceflag (unsign - * It copies 
the process, and if successful kick-starts - * it and waits for it to finish using the VM if required. - */ --long do_fork(unsigned long clone_flags, -+long do_fork_pid(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *parent_tidptr, -- int __user *child_tidptr) -+ int __user *child_tidptr, -+ long pid0) - { - struct task_struct *p; - int trace = 0; -@@ -1302,7 +1362,8 @@ long do_fork(unsigned long clone_flags, - clone_flags |= CLONE_PTRACE; - } - -- p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); -+ p = copy_process(clone_flags, stack_start, regs, stack_size, -+ parent_tidptr, child_tidptr, pid, pid0); - /* - * Do this prior waking up the new thread - the thread pointer - * might get invalid after that point, if the thread exits quickly. -@@ -1310,6 +1371,7 @@ long do_fork(unsigned long clone_flags, - if (!IS_ERR(p)) { - struct completion vfork; - -+ pid = virt_pid(p); - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; - init_completion(&vfork); -@@ -1330,13 +1392,18 @@ long do_fork(unsigned long clone_flags, - - if (unlikely (trace)) { - current->ptrace_message = pid; -+ set_pn_state(current, PN_STOP_FORK); - ptrace_notify ((trace << 8) | SIGTRAP); -+ clear_pn_state(current); - } - - if (clone_flags & CLONE_VFORK) { - wait_for_completion(&vfork); -- if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) -+ if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { -+ set_pn_state(current, PN_STOP_VFORK); - ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); -+ clear_pn_state(current); -+ } - } - } else { - free_pidmap(pid); -@@ -1349,26 +1416,39 @@ long do_fork(unsigned long clone_flags, - #define ARCH_MIN_MMSTRUCT_ALIGN 0 - #endif - -+EXPORT_SYMBOL(do_fork_pid); -+ -+long do_fork(unsigned long clone_flags, -+ unsigned long stack_start, -+ struct pt_regs *regs, -+ unsigned long stack_size, -+ int __user *parent_tidptr, -+ int __user 
*child_tidptr) -+{ -+ return do_fork_pid(clone_flags, stack_start, regs, stack_size, -+ parent_tidptr, child_tidptr, 0); -+} -+ - void __init proc_caches_init(void) - { - sighand_cachep = kmem_cache_create("sighand_cache", - sizeof(struct sighand_struct), 0, -- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); -+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); - signal_cachep = kmem_cache_create("signal_cache", - sizeof(struct signal_struct), 0, -- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); -+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); - files_cachep = kmem_cache_create("files_cache", - sizeof(struct files_struct), 0, -- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); -+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); - fs_cachep = kmem_cache_create("fs_cache", - sizeof(struct fs_struct), 0, -- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); -+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL, NULL); - mm_cachep = kmem_cache_create("mm_struct", - sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, -- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); -+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); - } - - -diff -upr linux-2.6.16.orig/kernel/hrtimer.c linux-2.6.16-026test009/kernel/hrtimer.c ---- linux-2.6.16.orig/kernel/hrtimer.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/hrtimer.c 2006-04-19 15:02:12.000000000 +0400 -@@ -439,6 +439,7 @@ hrtimer_start(struct hrtimer *timer, kti - - return ret; - } -+EXPORT_SYMBOL_GPL(hrtimer_start); - - /** - * hrtimer_try_to_cancel - try to deactivate a timer -@@ -467,6 +468,7 @@ int hrtimer_try_to_cancel(struct hrtimer - return ret; - - } -+EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); - - /** - * hrtimer_cancel - cancel a timer and wait for the handler to finish. 
-@@ -504,6 +506,7 @@ ktime_t hrtimer_get_remaining(const stru - - return rem; - } -+EXPORT_SYMBOL_GPL(hrtimer_get_remaining); - - #ifdef CONFIG_NO_IDLE_HZ - /** -@@ -670,7 +673,7 @@ void hrtimer_run_queues(void) - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - */ --static ktime_t __sched -+ktime_t __sched - schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) - { - /* fn stays NULL, meaning single-shot wakeup: */ -@@ -697,7 +700,7 @@ schedule_hrtimer_interruptible(struct hr - return schedule_hrtimer(timer, mode); - } - --static long __sched nanosleep_restart(struct restart_block *restart) -+long __sched nanosleep_restart(struct restart_block *restart) - { - struct timespec __user *rmtp; - struct timespec tu; -@@ -726,6 +729,7 @@ static long __sched nanosleep_restart(st - /* The other values in restart are already filled in */ - return -ERESTART_RESTARTBLOCK; - } -+EXPORT_SYMBOL_GPL(nanosleep_restart); - - long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, - const enum hrtimer_mode mode, const clockid_t clockid) -diff -upr linux-2.6.16.orig/kernel/irq/handle.c linux-2.6.16-026test009/kernel/irq/handle.c ---- linux-2.6.16.orig/kernel/irq/handle.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/irq/handle.c 2006-04-19 15:02:12.000000000 +0400 -@@ -14,6 +14,8 @@ - - #include "internals.h" - -+#include <ub/beancounter.h> -+ - /* - * Linux has a controller-independent interrupt architecture. 
- * Every controller has a 'controller-template', that is used -@@ -80,10 +82,12 @@ fastcall int handle_IRQ_event(unsigned i - struct irqaction *action) - { - int ret, retval = 0, status = 0; -+ struct user_beancounter *ub; - - if (!(action->flags & SA_INTERRUPT)) - local_irq_enable(); - -+ ub = set_exec_ub(get_ub0()); - do { - ret = action->handler(irq, action->dev_id, regs); - if (ret == IRQ_HANDLED) -@@ -91,6 +95,7 @@ fastcall int handle_IRQ_event(unsigned i - retval |= ret; - action = action->next; - } while (action); -+ (void)set_exec_ub(ub); - - if (status & SA_SAMPLE_RANDOM) - add_interrupt_randomness(irq); -diff -upr linux-2.6.16.orig/kernel/kmod.c linux-2.6.16-026test009/kernel/kmod.c ---- linux-2.6.16.orig/kernel/kmod.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/kmod.c 2006-04-19 15:02:12.000000000 +0400 -@@ -78,6 +78,10 @@ int request_module(const char *fmt, ...) - #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ - static int kmod_loop_msg; - -+ /* Don't allow request_module() inside VE. 
*/ -+ if (!ve_is_super(get_exec_env())) -+ return -EPERM; -+ - va_start(args, fmt); - ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); - va_end(args); -@@ -246,6 +250,9 @@ int call_usermodehelper_keys(char *path, - }; - DECLARE_WORK(work, __call_usermodehelper, &sub_info); - -+ if (!ve_is_super(get_exec_env())) -+ return -EPERM; -+ - if (!khelper_wq) - return -EBUSY; - -diff -upr linux-2.6.16.orig/kernel/kthread.c linux-2.6.16-026test009/kernel/kthread.c ---- linux-2.6.16.orig/kernel/kthread.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/kthread.c 2006-04-19 15:02:12.000000000 +0400 -@@ -114,7 +114,7 @@ static void keventd_create_kthread(void - create->result = ERR_PTR(pid); - } else { - wait_for_completion(&create->started); -- create->result = find_task_by_pid(pid); -+ create->result = find_task_by_pid_all(pid); - } - complete(&create->done); - } -diff -upr linux-2.6.16.orig/kernel/module.c linux-2.6.16-026test009/kernel/module.c ---- linux-2.6.16.orig/kernel/module.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/module.c 2006-04-19 15:02:12.000000000 +0400 -@@ -2130,6 +2130,8 @@ static void *m_start(struct seq_file *m, - loff_t n = 0; - - down(&module_mutex); -+ if (!ve_is_super(get_exec_env())) -+ return NULL; - list_for_each(i, &modules) { - if (n++ == *pos) - break; -diff -upr linux-2.6.16.orig/kernel/mutex-debug.c linux-2.6.16-026test009/kernel/mutex-debug.c ---- linux-2.6.16.orig/kernel/mutex-debug.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/mutex-debug.c 2006-04-19 15:02:12.000000000 +0400 -@@ -193,12 +193,12 @@ retry: - if (count != 10) - printk(" locked it.\n"); - -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - show_task_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - - printk("\n"); - show_held_locks(NULL); -diff -upr linux-2.6.16.orig/kernel/panic.c 
linux-2.6.16-026test009/kernel/panic.c ---- linux-2.6.16.orig/kernel/panic.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/panic.c 2006-04-19 15:02:12.000000000 +0400 -@@ -23,6 +23,8 @@ - int panic_timeout; - int panic_on_oops; - int tainted; -+int kernel_text_csum_broken; -+EXPORT_SYMBOL(kernel_text_csum_broken); - - EXPORT_SYMBOL(panic_timeout); - -@@ -156,7 +158,8 @@ const char *print_tainted(void) - { - static char buf[20]; - if (tainted) { -- snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", -+ snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", -+ kernel_text_csum_broken ? 'B' : ' ', - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', -diff -upr linux-2.6.16.orig/kernel/pid.c linux-2.6.16-026test009/kernel/pid.c ---- linux-2.6.16.orig/kernel/pid.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/pid.c 2006-04-19 15:02:12.000000000 +0400 -@@ -27,6 +27,10 @@ - #include <linux/bootmem.h> - #include <linux/hash.h> - -+#ifdef CONFIG_VE -+static void __free_vpid(int vpid, struct ve_struct *ve); -+#endif -+ - #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) - static struct hlist_head *pid_hash[PIDTYPE_MAX]; - static int pidhash_shift; -@@ -57,8 +61,14 @@ typedef struct pidmap { - void *page; - } pidmap_t; - -+#ifdef CONFIG_VE -+#define PIDMAP_NRFREE (BITS_PER_PAGE/2) -+#else -+#define PIDMAP_NRFREE BITS_PER_PAGE -+#endif -+ - static pidmap_t pidmap_array[PIDMAP_ENTRIES] = -- { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; -+ { [ 0 ... 
PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(PIDMAP_NRFREE), NULL } }; - - static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); - -@@ -67,9 +77,12 @@ fastcall void free_pidmap(int pid) - pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; - int offset = pid & BITS_PER_PAGE_MASK; - -+ BUG_ON(__is_virtual_pid(pid) || pid == 1); -+ - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); - } -+EXPORT_SYMBOL_GPL(free_pidmap); - - int alloc_pidmap(void) - { -@@ -77,6 +90,8 @@ int alloc_pidmap(void) - pidmap_t *map; - - pid = last + 1; -+ if (__is_virtual_pid(pid)) -+ pid += VPID_DIV; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; -@@ -106,6 +121,8 @@ int alloc_pidmap(void) - return pid; - } - offset = find_next_offset(map, offset); -+ if (__is_virtual_pid(offset)) -+ offset += VPID_DIV; - pid = mk_pid(map, offset); - /* - * find_next_offset() found a bit, the pid from it -@@ -130,6 +147,7 @@ int alloc_pidmap(void) - } - return -1; - } -+EXPORT_SYMBOL_GPL(alloc_pidmap); - - struct pid * fastcall find_pid(enum pid_type type, int nr) - { -@@ -143,6 +161,7 @@ struct pid * fastcall find_pid(enum pid_ - } - return NULL; - } -+EXPORT_SYMBOL(find_pid); - - int fastcall attach_pid(task_t *task, enum pid_type type, int nr) - { -@@ -162,6 +181,7 @@ int fastcall attach_pid(task_t *task, en - - return 0; - } -+EXPORT_SYMBOL_GPL(attach_pid); - - static fastcall int __detach_pid(task_t *task, enum pid_type type) - { -@@ -201,13 +221,27 @@ void fastcall detach_pid(task_t *task, e - if (tmp != type && find_pid(tmp, nr)) - return; - -+#ifdef CONFIG_VE -+ __free_vpid(task->pids[type].vnr, VE_TASK_INFO(task)->owner_env); -+#endif - free_pidmap(nr); - } -+EXPORT_SYMBOL_GPL(detach_pid); - - task_t *find_task_by_pid_type(int type, int nr) - { -+ BUG(); -+ return NULL; -+} -+ -+EXPORT_SYMBOL(find_task_by_pid_type); -+ -+task_t *find_task_by_pid_type_all(int type, int nr) -+{ - struct pid *pid; - -+ BUG_ON(nr != -1 && is_virtual_pid(nr)); -+ - pid = 
find_pid(type, nr); - if (!pid) - return NULL; -@@ -215,7 +249,35 @@ task_t *find_task_by_pid_type(int type, - return pid_task(&pid->pid_list, type); - } - --EXPORT_SYMBOL(find_task_by_pid_type); -+EXPORT_SYMBOL(find_task_by_pid_type_all); -+ -+#ifdef CONFIG_VE -+ -+task_t *find_task_by_pid_type_ve(int type, int nr) -+{ -+ task_t *tsk; -+ int gnr = nr; -+ struct pid *pid; -+ -+ if (is_virtual_pid(nr)) { -+ gnr = __vpid_to_pid(nr); -+ if (unlikely(gnr == -1)) -+ return NULL; -+ } -+ -+ pid = find_pid(type, gnr); -+ if (!pid) -+ return NULL; -+ -+ tsk = pid_task(&pid->pid_list, type); -+ if (!ve_accessible(VE_TASK_INFO(tsk)->owner_env, get_exec_env())) -+ return NULL; -+ return tsk; -+} -+ -+EXPORT_SYMBOL(find_task_by_pid_type_ve); -+ -+#endif - - /* - * This function switches the PIDs if a non-leader thread calls -@@ -234,12 +296,16 @@ void switch_exec_pids(task_t *leader, ta - - leader->pid = leader->tgid = thread->pid; - thread->pid = thread->tgid; -+ set_virt_tgid(leader, virt_pid(thread)); -+ set_virt_pid(leader, virt_pid(thread)); -+ set_virt_pid(thread, virt_tgid(thread)); - - attach_pid(thread, PIDTYPE_PID, thread->pid); - attach_pid(thread, PIDTYPE_TGID, thread->tgid); - attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); - attach_pid(thread, PIDTYPE_SID, thread->signal->session); - list_add_tail(&thread->tasks, &init_task.tasks); -+ SET_VE_LINKS(thread); - - attach_pid(leader, PIDTYPE_PID, leader->pid); - attach_pid(leader, PIDTYPE_TGID, leader->tgid); -@@ -247,6 +313,342 @@ void switch_exec_pids(task_t *leader, ta - attach_pid(leader, PIDTYPE_SID, leader->signal->session); - } - -+#ifdef CONFIG_VE -+ -+/* Virtual PID bits. -+ * -+ * At the moment all internal structures in kernel store real global pid. -+ * The only place, where virtual PID is used, is at user frontend. We -+ * remap virtual pids obtained from user to global ones (vpid_to_pid) and -+ * map globals to virtuals before showing them to user (virt_pid_type). 
-+ * -+ * We hold virtual PIDs inside struct pid, so map global -> virtual is easy. -+ */ -+ -+pid_t _pid_type_to_vpid(int type, pid_t pid) -+{ -+ struct pid * p; -+ -+ if (unlikely(is_virtual_pid(pid))) -+ return -1; -+ -+ read_lock(&tasklist_lock); -+ p = find_pid(type, pid); -+ if (p) { -+ pid = p->vnr; -+ } else { -+ pid = -1; -+ } -+ read_unlock(&tasklist_lock); -+ return pid; -+} -+EXPORT_SYMBOL_GPL(_pid_type_to_vpid); -+ -+pid_t pid_type_to_vpid(int type, pid_t pid) -+{ -+ int vpid; -+ -+ if (unlikely(pid <= 0)) -+ return pid; -+ -+ BUG_ON(is_virtual_pid(pid)); -+ -+ if (ve_is_super(get_exec_env())) -+ return pid; -+ -+ vpid = _pid_type_to_vpid(type, pid); -+ if (unlikely(vpid == -1)) { -+ /* It is allowed: global pid can be used everywhere. -+ * This can happen, when kernel remembers stray pids: -+ * signal queues, locks etc. -+ */ -+ vpid = pid; -+ } -+ return vpid; -+} -+EXPORT_SYMBOL_GPL(pid_type_to_vpid); -+ -+/* To map virtual pids to global we maintain special hash table. -+ * -+ * Mapping entries are allocated when a process with non-trivial -+ * mapping is forked, which is possible only after VE migrated. -+ * Mappings are destroyed, when a global pid is removed from global -+ * pidmap, which means we do not need to refcount mappings. -+ */ -+ -+static struct hlist_head *vpid_hash; -+ -+struct vpid_mapping -+{ -+ int vpid; -+ int veid; -+ int pid; -+ struct hlist_node link; -+}; -+ -+static kmem_cache_t *vpid_mapping_cachep; -+ -+static inline int vpid_hashfn(int vnr, int veid) -+{ -+ return hash_long((unsigned long)(vnr+(veid<<16)), pidhash_shift); -+} -+ -+struct vpid_mapping *__lookup_vpid_mapping(int vnr, int veid) -+{ -+ struct hlist_node *elem; -+ struct vpid_mapping *map; -+ -+ hlist_for_each_entry(map, elem, -+ &vpid_hash[vpid_hashfn(vnr, veid)], link) { -+ if (map->vpid == vnr && map->veid == veid) -+ return map; -+ } -+ return NULL; -+} -+ -+/* __vpid_to_pid() is raw version of vpid_to_pid(). It is to be used -+ * only under tasklist_lock. 
In some places we must use only this version -+ * (f.e. __kill_pg_info is called under write lock!) -+ * -+ * Caller should pass virtual pid. This function returns an error, when -+ * seeing a global pid. -+ */ -+int __vpid_to_pid(int pid) -+{ -+ struct vpid_mapping *map; -+ -+ if (unlikely(!is_virtual_pid(pid) || ve_is_super(get_exec_env()))) -+ return -1; -+ -+ if (!get_exec_env()->sparse_vpid) { -+ if (pid != 1) -+ return pid - VPID_DIV; -+ return get_exec_env()->init_entry->pid; -+ } -+ -+ map = __lookup_vpid_mapping(pid, VEID(get_exec_env())); -+ if (map) -+ return map->pid; -+ return -1; -+} -+EXPORT_SYMBOL_GPL(__vpid_to_pid); -+ -+int vpid_to_pid(int pid) -+{ -+ /* User gave bad pid. It is his problem. */ -+ if (unlikely(pid <= 0)) -+ return pid; -+ -+ if (!is_virtual_pid(pid)) -+ return pid; -+ -+ read_lock(&tasklist_lock); -+ pid = __vpid_to_pid(pid); -+ read_unlock(&tasklist_lock); -+ return pid; -+} -+EXPORT_SYMBOL_GPL(vpid_to_pid); -+ -+/* VEs which never migrated have trivial "arithmetic" mapping pid <-> vpid: -+ * -+ * vpid == 1 -> ve->init_task->pid -+ * else pid & ~VPID_DIV -+ * -+ * In this case VE has ve->sparse_vpid = 0 and we do not use vpid hash table. -+ * -+ * When VE migrates and we see non-trivial mapping the first time, we -+ * scan process table and populate mapping hash table. 
-+ */ -+ -+static int add_mapping(int pid, int vpid, int veid, struct hlist_head *cache) -+{ -+ if (pid > 0 && vpid > 0 && !__lookup_vpid_mapping(vpid, veid)) { -+ struct vpid_mapping *m; -+ if (hlist_empty(cache)) { -+ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_ATOMIC); -+ if (unlikely(m == NULL)) -+ return -ENOMEM; -+ } else { -+ m = hlist_entry(cache->first, struct vpid_mapping, link); -+ hlist_del(&m->link); -+ } -+ m->pid = pid; -+ m->vpid = vpid; -+ m->veid = veid; -+ hlist_add_head(&m->link, -+ &vpid_hash[vpid_hashfn(vpid, veid)]); -+ } -+ return 0; -+} -+ -+static int switch_to_sparse_mapping(int pid) -+{ -+ struct ve_struct *env = get_exec_env(); -+ struct hlist_head cache; -+ task_t *g, *t; -+ int pcount; -+ int err; -+ -+ /* Transition happens under write_lock_irq, so we try to make -+ * it more reliable and fast preallocating mapping entries. -+ * pcounter may be not enough, we could have lots of orphaned -+ * process groups and sessions, which also require mappings. -+ */ -+ INIT_HLIST_HEAD(&cache); -+ pcount = atomic_read(&env->pcounter); -+ err = -ENOMEM; -+ while (pcount > 0) { -+ struct vpid_mapping *m; -+ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL); -+ if (!m) -+ goto out; -+ hlist_add_head(&m->link, &cache); -+ pcount--; -+ } -+ -+ write_lock_irq(&tasklist_lock); -+ err = 0; -+ if (env->sparse_vpid) -+ goto out_unlock; -+ -+ err = -ENOMEM; -+ do_each_thread_ve(g, t) { -+ if (t->pid == pid) -+ continue; -+ if (add_mapping(t->pid, virt_pid(t), VEID(env), &cache)) -+ goto out_unlock; -+ } while_each_thread_ve(g, t); -+ -+ for_each_process_ve(t) { -+ if (t->pid == pid) -+ continue; -+ -+ if (add_mapping(t->tgid, virt_tgid(t), VEID(env), &cache)) -+ goto out_unlock; -+ if (add_mapping(t->signal->pgrp, virt_pgid(t), VEID(env), &cache)) -+ goto out_unlock; -+ if (add_mapping(t->signal->session, virt_sid(t), VEID(env), &cache)) -+ goto out_unlock; -+ } -+ env->sparse_vpid = 1; -+ err = 0; -+ -+out_unlock: -+ if (err) { -+ int i; -+ -+ for 
(i=0; i<(1<<pidhash_shift); i++) { -+ struct hlist_node *elem, *next; -+ struct vpid_mapping *map; -+ -+ hlist_for_each_entry_safe(map, elem, next, &vpid_hash[i], link) { -+ if (map->veid == VEID(env)) { -+ hlist_del(elem); -+ hlist_add_head(elem, &cache); -+ } -+ } -+ } -+ } -+ write_unlock_irq(&tasklist_lock); -+ -+out: -+ while (!hlist_empty(&cache)) { -+ struct vpid_mapping *m; -+ m = hlist_entry(cache.first, struct vpid_mapping, link); -+ hlist_del(&m->link); -+ kmem_cache_free(vpid_mapping_cachep, m); -+ } -+ return err; -+} -+ -+int alloc_vpid(int pid, int virt_pid) -+{ -+ int result; -+ struct vpid_mapping *m; -+ struct ve_struct *env = get_exec_env(); -+ -+ if (ve_is_super(env) || !env->virt_pids) -+ return pid; -+ -+ if (!env->sparse_vpid) { -+ if (virt_pid == -1) -+ return pid + VPID_DIV; -+ -+ if (virt_pid == 1 || virt_pid == pid + VPID_DIV) -+ return virt_pid; -+ -+ if ((result = switch_to_sparse_mapping(pid)) < 0) -+ return result; -+ } -+ -+ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL); -+ if (!m) -+ return -ENOMEM; -+ -+ m->pid = pid; -+ m->veid = VEID(env); -+ -+ result = (virt_pid == -1) ? pid + VPID_DIV : virt_pid; -+ -+ write_lock_irq(&tasklist_lock); -+ if (unlikely(__lookup_vpid_mapping(result, m->veid))) { -+ if (virt_pid > 0) { -+ result = -EEXIST; -+ goto out; -+ } -+ -+ /* No luck. Now we search for some not-existing vpid. -+ * It is weak place. We do linear search. 
*/ -+ do { -+ result++; -+ if (!__is_virtual_pid(result)) -+ result += VPID_DIV; -+ if (result >= pid_max) -+ result = RESERVED_PIDS + VPID_DIV; -+ } while (__lookup_vpid_mapping(result, m->veid) != NULL); -+ -+ /* And set last_pid in hope future alloc_pidmap to avoid -+ * collisions after future alloc_pidmap() */ -+ last_pid = result - VPID_DIV; -+ } -+ if (result > 0) { -+ m->vpid = result; -+ hlist_add_head(&m->link, -+ &vpid_hash[vpid_hashfn(result, m->veid)]); -+ } -+out: -+ write_unlock_irq(&tasklist_lock); -+ if (result < 0) -+ kmem_cache_free(vpid_mapping_cachep, m); -+ return result; -+} -+EXPORT_SYMBOL(alloc_vpid); -+ -+static void __free_vpid(int vpid, struct ve_struct *ve) -+{ -+ struct vpid_mapping *m; -+ -+ if (!ve->sparse_vpid) -+ return; -+ -+ if (!__is_virtual_pid(vpid) && (vpid != 1 || ve_is_super(ve))) -+ return; -+ -+ m = __lookup_vpid_mapping(vpid, ve->veid); -+ BUG_ON(m == NULL); -+ hlist_del(&m->link); -+ kmem_cache_free(vpid_mapping_cachep, m); -+} -+ -+void free_vpid(int vpid, struct ve_struct *ve) -+{ -+ write_lock_irq(&tasklist_lock); -+ __free_vpid(vpid, ve); -+ write_unlock_irq(&tasklist_lock); -+} -+EXPORT_SYMBOL(free_vpid); -+#endif -+ - /* - * The pid hash table is scaled according to the amount of memory in the - * machine. 
From a minimum of 16 slots up to 4096 slots at one gigabyte or -@@ -273,6 +675,14 @@ void __init pidhash_init(void) - for (j = 0; j < pidhash_size; j++) - INIT_HLIST_HEAD(&pid_hash[i][j]); - } -+ -+#ifdef CONFIG_VE -+ vpid_hash = alloc_bootmem(pidhash_size * sizeof(struct hlist_head)); -+ if (!vpid_hash) -+ panic("Could not alloc vpid_hash!\n"); -+ for (j = 0; j < pidhash_size; j++) -+ INIT_HLIST_HEAD(&vpid_hash[j]); -+#endif - } - - void __init pidmap_init(void) -@@ -289,4 +699,12 @@ void __init pidmap_init(void) - - for (i = 0; i < PIDTYPE_MAX; i++) - attach_pid(current, i, 0); -+ -+#ifdef CONFIG_VE -+ vpid_mapping_cachep = -+ kmem_cache_create("vpid_mapping", -+ sizeof(struct vpid_mapping), -+ __alignof__(struct vpid_mapping), -+ SLAB_PANIC|SLAB_UBC, NULL, NULL); -+#endif - } -diff -upr linux-2.6.16.orig/kernel/posix-cpu-timers.c linux-2.6.16-026test009/kernel/posix-cpu-timers.c ---- linux-2.6.16.orig/kernel/posix-cpu-timers.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/posix-cpu-timers.c 2006-04-19 15:02:12.000000000 +0400 -@@ -20,7 +20,7 @@ static int check_clock(const clockid_t w - return 0; - - read_lock(&tasklist_lock); -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? 
- p->tgid != current->tgid : p->tgid != pid)) { - error = -EINVAL; -@@ -292,7 +292,7 @@ int posix_cpu_clock_get(const clockid_t - */ - struct task_struct *p; - read_lock(&tasklist_lock); -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - if (p) { - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (p->tgid == current->tgid) { -@@ -336,7 +336,7 @@ int posix_cpu_timer_create(struct k_itim - if (pid == 0) { - p = current; - } else { -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - if (p && p->tgid != current->tgid) - p = NULL; - } -@@ -344,7 +344,7 @@ int posix_cpu_timer_create(struct k_itim - if (pid == 0) { - p = current->group_leader; - } else { -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - if (p && p->tgid != pid) - p = NULL; - } -diff -upr linux-2.6.16.orig/kernel/posix-timers.c linux-2.6.16-026test009/kernel/posix-timers.c ---- linux-2.6.16.orig/kernel/posix-timers.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/posix-timers.c 2006-04-19 15:02:12.000000000 +0400 -@@ -31,6 +31,7 @@ - * POSIX clocks & timers - */ - #include <linux/mm.h> -+#include <linux/module.h> - #include <linux/smp_lock.h> - #include <linux/interrupt.h> - #include <linux/slab.h> -@@ -48,6 +49,8 @@ - #include <linux/workqueue.h> - #include <linux/module.h> - -+#include <ub/beancounter.h> -+ - /* - * Management arrays for POSIX timers. 
Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the -@@ -241,7 +244,8 @@ static __init int init_posix_timers(void - register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); - - posix_timers_cache = kmem_cache_create("posix_timers_cache", -- sizeof (struct k_itimer), 0, 0, NULL, NULL); -+ sizeof (struct k_itimer), 0, -+ SLAB_UBC, NULL, NULL); - idr_init(&posix_timers_id); - return 0; - } -@@ -294,6 +298,13 @@ void do_schedule_next_timer(struct sigin - - int posix_timer_event(struct k_itimer *timr,int si_private) - { -+ int ret; -+ struct ve_struct *ve; -+ struct user_beancounter *ub; -+ -+ ve = set_exec_env(timr->it_process->ve_task_info.owner_env); -+ ub = set_exec_ub(timr->it_process->task_bc.task_ub); -+ - memset(&timr->sigq->info, 0, sizeof(siginfo_t)); - timr->sigq->info.si_sys_private = si_private; - /* Send signal to the process that owns this timer.*/ -@@ -306,11 +317,11 @@ int posix_timer_event(struct k_itimer *t - - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - struct task_struct *leader; -- int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, -+ ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); - - if (likely(ret >= 0)) -- return ret; -+ goto out; - - timr->it_sigev_notify = SIGEV_SIGNAL; - leader = timr->it_process->group_leader; -@@ -318,8 +329,12 @@ int posix_timer_event(struct k_itimer *t - timr->it_process = leader; - } - -- return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, -+ ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); -+out: -+ (void)set_exec_ub(ub); -+ (void)set_exec_env(ve); -+ return ret; - } - EXPORT_SYMBOL_GPL(posix_timer_event); - -@@ -366,7 +381,7 @@ static struct task_struct * good_sigeven - struct task_struct *rtn = current->group_leader; - - if ((event->sigev_notify & SIGEV_THREAD_ID ) && -- (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || -+ (!(rtn = 
find_task_by_pid_ve(event->sigev_notify_thread_id)) || - rtn->tgid != current->tgid || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) - return NULL; -diff -upr linux-2.6.16.orig/kernel/power/Kconfig linux-2.6.16-026test009/kernel/power/Kconfig ---- linux-2.6.16.orig/kernel/power/Kconfig 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/power/Kconfig 2006-04-19 15:02:13.000000000 +0400 -@@ -38,7 +38,7 @@ config PM_DEBUG - - config SOFTWARE_SUSPEND - bool "Software Suspend" -- depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) -+ depends on PM && SWAP && X86 || ((FRV || PPC32) && !SMP) - ---help--- - Enable the possibility of suspending the machine. - It doesn't need APM. -diff -upr linux-2.6.16.orig/kernel/power/process.c linux-2.6.16-026test009/kernel/power/process.c ---- linux-2.6.16.orig/kernel/power/process.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/power/process.c 2006-04-19 15:02:12.000000000 +0400 -@@ -38,18 +38,23 @@ void refrigerator(void) - processes around? */ - long save; - save = current->state; -+ current->state = TASK_UNINTERRUPTIBLE; - pr_debug("%s entered refrigerator\n", current->comm); -- printk("="); -+ /* printk("="); */ - -- frozen_process(current); - spin_lock_irq(¤t->sighand->siglock); -- recalc_sigpending(); /* We sent fake signal, clean it up */ -+ if (test_and_clear_thread_flag(TIF_FREEZE)) { -+ recalc_sigpending(); /* We sent fake signal, clean it up */ -+ current->flags |= PF_FROZEN; -+ } else { -+ /* Freeze request could be canceled before we entered -+ * refrigerator(). In this case we do nothing. 
*/ -+ current->state = save; -+ } - spin_unlock_irq(¤t->sighand->siglock); - -- while (frozen(current)) { -- current->state = TASK_UNINTERRUPTIBLE; -+ while (current->flags & PF_FROZEN) - schedule(); -- } - pr_debug("%s left refrigerator\n", current->comm); - current->state = save; - } -@@ -67,7 +72,7 @@ int freeze_processes(void) - do { - todo = 0; - read_lock(&tasklist_lock); -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - if (!freezeable(p)) - continue; - if (frozen(p)) -@@ -78,7 +83,7 @@ int freeze_processes(void) - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - todo++; -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - read_unlock(&tasklist_lock); - yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, start_time + TIMEOUT)) { -@@ -95,15 +100,15 @@ int freeze_processes(void) - */ - if (todo) { - read_lock(&tasklist_lock); -- do_each_thread(g, p) -+ do_each_thread_all(g, p) - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); -- p->flags &= ~PF_FREEZE; - spin_lock_irqsave(&p->sighand->siglock, flags); -+ clear_tsk_thread_flag(p, TIF_FREEZE); - recalc_sigpending_tsk(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -- while_each_thread(g, p); -+ while_each_thread_all(g, p); - read_unlock(&tasklist_lock); - return todo; - } -@@ -119,12 +124,12 @@ void thaw_processes(void) - - printk( "Restarting tasks..." 
); - read_lock(&tasklist_lock); -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - if (!freezeable(p)) - continue; - if (!thaw_process(p)) - printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - - read_unlock(&tasklist_lock); - schedule(); -diff -upr linux-2.6.16.orig/kernel/printk.c linux-2.6.16-026test009/kernel/printk.c ---- linux-2.6.16.orig/kernel/printk.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/printk.c 2006-04-19 15:02:12.000000000 +0400 -@@ -30,7 +30,9 @@ - #include <linux/smp.h> - #include <linux/security.h> - #include <linux/bootmem.h> -+#include <linux/vzratelimit.h> - #include <linux/syscalls.h> -+#include <linux/veprintk.h> - - #include <asm/uaccess.h> - -@@ -83,7 +85,7 @@ static int console_locked; - * It is also used in interesting ways to provide interlocking in - * release_console_sem(). - */ --static DEFINE_SPINLOCK(logbuf_lock); -+DEFINE_SPINLOCK(logbuf_lock); - - #define LOG_BUF_MASK (log_buf_len-1) - #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) -@@ -179,18 +181,18 @@ static int __init log_buf_len_setup(char - - spin_lock_irqsave(&logbuf_lock, flags); - log_buf_len = size; -- log_buf = new_log_buf; -+ ve_log_buf = new_log_buf; - -- offset = start = min(con_start, log_start); -+ offset = start = min(con_start, ve_log_start); - dest_idx = 0; -- while (start != log_end) { -- log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; -+ while (start != ve_log_end) { -+ ve_log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; - start++; - dest_idx++; - } -- log_start -= offset; -+ ve_log_start -= offset; - con_start -= offset; -- log_end -= offset; -+ ve_log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); - - printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); -@@ -223,6 +225,10 @@ int do_syslog(int type, char __user *buf - char c; - int error = 0; - -+ if (!ve_is_super(get_exec_env()) && -+ (type == 6 || type == 
7 || type == 8)) -+ goto out; -+ - error = security_syslog(type); - if (error) - return error; -@@ -243,15 +249,15 @@ int do_syslog(int type, char __user *buf - error = -EFAULT; - goto out; - } -- error = wait_event_interruptible(log_wait, -- (log_start - log_end)); -+ error = wait_event_interruptible(ve_log_wait, -+ (ve_log_start - ve_log_end)); - if (error) - goto out; - i = 0; - spin_lock_irq(&logbuf_lock); -- while (!error && (log_start != log_end) && i < len) { -- c = LOG_BUF(log_start); -- log_start++; -+ while (!error && (ve_log_start != ve_log_end) && i < len) { -+ c = VE_LOG_BUF(ve_log_start); -+ ve_log_start++; - spin_unlock_irq(&logbuf_lock); - error = __put_user(c,buf); - buf++; -@@ -277,15 +283,17 @@ int do_syslog(int type, char __user *buf - error = -EFAULT; - goto out; - } -+ if (ve_log_buf == NULL) -+ goto out; - count = len; -- if (count > log_buf_len) -- count = log_buf_len; -+ if (count > ve_log_buf_len) -+ count = ve_log_buf_len; - spin_lock_irq(&logbuf_lock); -- if (count > logged_chars) -- count = logged_chars; -+ if (count > ve_logged_chars) -+ count = ve_logged_chars; - if (do_clear) -- logged_chars = 0; -- limit = log_end; -+ ve_logged_chars = 0; -+ limit = ve_log_end; - /* - * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages -@@ -294,9 +302,9 @@ int do_syslog(int type, char __user *buf - */ - for (i = 0; i < count && !error; i++) { - j = limit-1-i; -- if (j + log_buf_len < log_end) -+ if (j + ve_log_buf_len < ve_log_end) - break; -- c = LOG_BUF(j); -+ c = VE_LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); - error = __put_user(c,&buf[count-1-i]); - cond_resched(); -@@ -320,7 +328,7 @@ int do_syslog(int type, char __user *buf - } - break; - case 5: /* Clear ring buffer */ -- logged_chars = 0; -+ ve_logged_chars = 0; - break; - case 6: /* Disable logging to console */ - console_loglevel = minimum_console_loglevel; -@@ -338,10 +346,10 @@ int do_syslog(int type, char __user *buf - error = 0; - break; - case 9: 
/* Number of chars in the log buffer */ -- error = log_end - log_start; -+ error = ve_log_end - ve_log_start; - break; - case 10: /* Size of the log buffer */ -- error = log_buf_len; -+ error = ve_log_buf_len; - break; - default: - error = -EINVAL; -@@ -365,7 +373,7 @@ static void __call_console_drivers(unsig - - for (con = console_drivers; con; con = con->next) { - if ((con->flags & CON_ENABLED) && con->write) -- con->write(con, &LOG_BUF(start), end - start); -+ con->write(con, &VE_LOG_BUF(start), end - start); - } - } - -@@ -377,11 +385,11 @@ static void _call_console_drivers(unsign - { - if (msg_log_level < console_loglevel && - console_drivers && start != end) { -- if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { -+ if ((start & VE_LOG_BUF_MASK) > (end & VE_LOG_BUF_MASK)) { - /* wrapped write */ -- __call_console_drivers(start & LOG_BUF_MASK, -- log_buf_len); -- __call_console_drivers(0, end & LOG_BUF_MASK); -+ __call_console_drivers(start & VE_LOG_BUF_MASK, -+ ve_log_buf_len); -+ __call_console_drivers(0, end & VE_LOG_BUF_MASK); - } else { - __call_console_drivers(start, end); - } -@@ -405,16 +413,16 @@ static void call_console_drivers(unsigne - start_print = start; - while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2) && -- LOG_BUF(cur_index + 0) == '<' && -- LOG_BUF(cur_index + 1) >= '0' && -- LOG_BUF(cur_index + 1) <= '7' && -- LOG_BUF(cur_index + 2) == '>') { -- msg_level = LOG_BUF(cur_index + 1) - '0'; -+ VE_LOG_BUF(cur_index + 0) == '<' && -+ VE_LOG_BUF(cur_index + 1) >= '0' && -+ VE_LOG_BUF(cur_index + 1) <= '7' && -+ VE_LOG_BUF(cur_index + 2) == '>') { -+ msg_level = VE_LOG_BUF(cur_index + 1) - '0'; - cur_index += 3; - start_print = cur_index; - } - while (cur_index != end) { -- char c = LOG_BUF(cur_index); -+ char c = VE_LOG_BUF(cur_index); - - cur_index++; - if (c == '\n') { -@@ -439,14 +447,14 @@ static void call_console_drivers(unsigne - - static void emit_log_char(char c) - { -- LOG_BUF(log_end) = c; -- log_end++; -- if 
(log_end - log_start > log_buf_len) -- log_start = log_end - log_buf_len; -- if (log_end - con_start > log_buf_len) -- con_start = log_end - log_buf_len; -- if (logged_chars < log_buf_len) -- logged_chars++; -+ VE_LOG_BUF(ve_log_end) = c; -+ ve_log_end++; -+ if (ve_log_end - ve_log_start > ve_log_buf_len) -+ ve_log_start = ve_log_end - ve_log_buf_len; -+ if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len) -+ con_start = ve_log_end - ve_log_buf_len; -+ if (ve_logged_chars < ve_log_buf_len) -+ ve_logged_chars++; - } - - /* -@@ -511,18 +519,68 @@ __attribute__((weak)) unsigned long long - * printf(3) - */ - -+static inline int ve_log_init(void) -+{ -+#ifdef CONFIG_VE -+ if (ve_log_buf != NULL) -+ return 0; -+ -+ if (ve_is_super(get_exec_env())) { -+ ve0._log_wait = &log_wait; -+ ve0._log_start = &log_start; -+ ve0._log_end = &log_end; -+ ve0._logged_chars = &logged_chars; -+ ve0.log_buf = log_buf; -+ return 0; -+ } -+ -+ ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC); -+ if (!ve_log_buf) -+ return -ENOMEM; -+ -+ memset(ve_log_buf, 0, ve_log_buf_len); -+#endif -+ return 0; -+} -+ - asmlinkage int printk(const char *fmt, ...) - { - va_list args; - int r; -+ struct ve_struct *ve; - - va_start(args, fmt); -+ ve = set_exec_env(get_ve0()); - r = vprintk(fmt, args); -+ (void)set_exec_env(ve); - va_end(args); - - return r; - } - -+asmlinkage int ve_printk(int dst, const char *fmt, ...) 
-+{ -+ va_list args; -+ int printed_len; -+ -+ printed_len = 0; -+ if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) { -+ struct ve_struct *env; -+ va_start(args, fmt); -+ env = set_exec_env(get_ve0()); -+ printed_len = vprintk(fmt, args); -+ (void)set_exec_env(env); -+ va_end(args); -+ } -+ if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) { -+ va_start(args, fmt); -+ printed_len = vprintk(fmt, args); -+ va_end(args); -+ } -+ return printed_len; -+} -+EXPORT_SYMBOL(ve_printk); -+ - /* cpu currently holding logbuf_lock */ - static volatile unsigned int printk_cpu = UINT_MAX; - -@@ -533,6 +591,7 @@ asmlinkage int vprintk(const char *fmt, - char *p; - static char printk_buf[1024]; - static int log_level_unknown = 1; -+ int err, need_wake; - - preempt_disable(); - if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) -@@ -544,6 +603,12 @@ asmlinkage int vprintk(const char *fmt, - spin_lock_irqsave(&logbuf_lock, flags); - printk_cpu = smp_processor_id(); - -+ err = ve_log_init(); -+ if (err) { -+ spin_unlock_irqrestore(&logbuf_lock, flags); -+ return err; -+ } -+ - /* Emit the output into the temporary buffer */ - printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); - -@@ -615,7 +680,12 @@ asmlinkage int vprintk(const char *fmt, - spin_unlock_irqrestore(&logbuf_lock, flags); - goto out; - } -- if (!down_trylock(&console_sem)) { -+ if (!ve_is_super(get_exec_env())) { -+ need_wake = (ve_log_start != ve_log_end); -+ spin_unlock_irqrestore(&logbuf_lock, flags); -+ if (!oops_in_progress && need_wake) -+ wake_up_interruptible(&ve_log_wait); -+ } else if (!down_trylock(&console_sem)) { - console_locked = 1; - /* - * We own the drivers. 
We can drop the spinlock and let -@@ -732,6 +802,12 @@ int is_console_locked(void) - } - EXPORT_SYMBOL(is_console_locked); - -+void wake_up_klogd(void) -+{ -+ if (!oops_in_progress && waitqueue_active(&ve_log_wait)) -+ wake_up_interruptible(&ve_log_wait); -+} -+ - /** - * release_console_sem - unlock the console system - * -@@ -754,12 +830,12 @@ void release_console_sem(void) - - for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); -- wake_klogd |= log_start - log_end; -- if (con_start == log_end) -+ wake_klogd |= ve_log_start - ve_log_end; -+ if (con_start == ve_log_end) - break; /* Nothing to print */ - _con_start = con_start; -- _log_end = log_end; -- con_start = log_end; /* Flush */ -+ _log_end = ve_log_end; -+ con_start = ve_log_end; /* Flush */ - spin_unlock(&logbuf_lock); - call_console_drivers(_con_start, _log_end); - local_irq_restore(flags); -@@ -768,8 +844,8 @@ void release_console_sem(void) - console_may_schedule = 0; - up(&console_sem); - spin_unlock_irqrestore(&logbuf_lock, flags); -- if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) -- wake_up_interruptible(&log_wait); -+ if (wake_klogd) -+ wake_up_klogd(); - } - EXPORT_SYMBOL(release_console_sem); - -@@ -940,7 +1016,7 @@ void register_console(struct console *co - * for us. - */ - spin_lock_irqsave(&logbuf_lock, flags); -- con_start = log_start; -+ con_start = ve_log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); - } - release_console_sem(); -@@ -1049,3 +1125,33 @@ int printk_ratelimit(void) - printk_ratelimit_burst); - } - EXPORT_SYMBOL(printk_ratelimit); -+ -+/* -+ * Rate limiting stuff. 
-+ */ -+int vz_ratelimit(struct vz_rate_info *p) -+{ -+ unsigned long cjif, djif; -+ unsigned long flags; -+ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; -+ long new_bucket; -+ -+ spin_lock_irqsave(&ratelimit_lock, flags); -+ cjif = jiffies; -+ djif = cjif - p->last; -+ if (djif < p->interval) { -+ if (p->bucket >= p->burst) { -+ spin_unlock_irqrestore(&ratelimit_lock, flags); -+ return 0; -+ } -+ p->bucket++; -+ } else { -+ new_bucket = p->bucket - (djif / (unsigned)p->interval); -+ if (new_bucket < 0) -+ new_bucket = 0; -+ p->bucket = new_bucket + 1; -+ } -+ p->last = cjif; -+ spin_unlock_irqrestore(&ratelimit_lock, flags); -+ return 1; -+} -diff -upr linux-2.6.16.orig/kernel/ptrace.c linux-2.6.16-026test009/kernel/ptrace.c ---- linux-2.6.16.orig/kernel/ptrace.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ptrace.c 2006-04-19 15:02:12.000000000 +0400 -@@ -57,10 +57,6 @@ void ptrace_untrace(task_t *child) - signal_wake_up(child, 1); - } - } -- if (child->signal->flags & SIGNAL_GROUP_EXIT) { -- sigaddset(&child->pending.signal, SIGKILL); -- signal_wake_up(child, 1); -- } - spin_unlock(&child->sighand->siglock); - } - -@@ -82,7 +78,8 @@ void __ptrace_unlink(task_t *child) - SET_LINKS(child); - } - -- ptrace_untrace(child); -+ if (child->state == TASK_TRACED) -+ ptrace_untrace(child); - } - - /* -@@ -136,7 +133,10 @@ static int may_attach(struct task_struct - smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) - return -EPERM; -- -+ if (!task->mm->vps_dumpable && !ve_is_super(get_exec_env())) -+ return -EPERM; -+ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env())) -+ return -EPERM; - return security_ptrace(current, task); - } - -@@ -263,6 +263,7 @@ int access_process_vm(struct task_struct - - return buf - old_buf; - } -+EXPORT_SYMBOL_GPL(access_process_vm); - - int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) - { -@@ -459,7 +460,7 @@ struct task_struct 
*ptrace_get_task_stru - return ERR_PTR(-EPERM); - - read_lock(&tasklist_lock); -- child = find_task_by_pid(pid); -+ child = find_task_by_pid_ve(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); -diff -upr linux-2.6.16.orig/kernel/sched.c linux-2.6.16-026test009/kernel/sched.c ---- linux-2.6.16.orig/kernel/sched.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/sched.c 2006-04-19 15:02:12.000000000 +0400 -@@ -220,6 +220,9 @@ struct runqueue { - */ - unsigned long nr_uninterruptible; - -+ unsigned long nr_sleeping; -+ unsigned long nr_stopped; -+ - unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; - task_t *curr, *idle; -@@ -237,6 +240,7 @@ struct runqueue { - - task_t *migration_thread; - struct list_head migration_queue; -+ int cpu; - #endif - - #ifdef CONFIG_SCHEDSTATS -@@ -284,6 +288,11 @@ for (domain = rcu_dereference(cpu_rq(cpu - # define finish_arch_switch(prev) do { } while (0) - #endif - -+struct kernel_stat_glob kstat_glob; -+spinlock_t kstat_glb_lock = SPIN_LOCK_UNLOCKED; -+EXPORT_SYMBOL(kstat_glob); -+EXPORT_SYMBOL(kstat_glb_lock); -+ - #ifndef __ARCH_WANT_UNLOCKED_CTXSW - static inline int task_running(runqueue_t *rq, task_t *p) - { -@@ -374,6 +383,186 @@ static inline void task_rq_unlock(runque - spin_unlock_irqrestore(&rq->lock, *flags); - } - -+#ifdef CONFIG_VE -+#define ve_nr_iowait_inc(env, cpu) \ -+ do { \ -+ VE_CPU_STATS((env), (cpu))->nr_iowait++; \ -+ } while(0) -+#define ve_nr_iowait_dec(env, cpu) \ -+ do { \ -+ VE_CPU_STATS((env), (cpu))->nr_iowait--; \ -+ } while(0) -+#define ve_nr_unint_inc(env, cpu) \ -+ do { \ -+ VE_CPU_STATS((env), (cpu))->nr_unint++; \ -+ } while(0) -+#define ve_nr_unint_dec(env, cpu) \ -+ do { \ -+ VE_CPU_STATS((env), (cpu))->nr_unint--; \ -+ } while(0) -+ -+#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0) -+ -+cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu) -+{ -+ struct ve_cpu_stats *ve_stat; -+ unsigned v; -+ 
cycles_t strt, ret, cycles; -+ -+ ve_stat = VE_CPU_STATS(ve, cpu); -+ do { -+ v = read_seqcount_begin(&ve_stat->stat_lock); -+ ret = ve_stat->idle_time; -+ strt = ve_stat->strt_idle_time; -+ if (strt && nr_uninterruptible_ve(ve) == 0) { -+ cycles = get_cycles(); -+ if (cycles_after(cycles, strt)) -+ ret += cycles - strt; -+ } -+ } while (read_seqcount_retry(&ve_stat->stat_lock, v)); -+ return ret; -+} -+EXPORT_SYMBOL(ve_sched_get_idle_time); -+ -+cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu) -+{ -+ struct ve_cpu_stats *ve_stat; -+ unsigned v; -+ cycles_t strt, ret, cycles; -+ -+ ve_stat = VE_CPU_STATS(ve, cpu); -+ do { -+ v = read_seqcount_begin(&ve_stat->stat_lock); -+ ret = ve_stat->iowait_time; -+ strt = ve_stat->strt_idle_time; -+ if (strt && nr_uninterruptible_ve(ve) > 0) { -+ cycles = get_cycles(); -+ if (cycles_after(cycles, strt)) -+ ret += cycles - strt; -+ } -+ } while (read_seqcount_retry(&ve_stat->stat_lock, v)); -+ return ret; -+} -+ -+EXPORT_SYMBOL(ve_sched_get_iowait_time); -+ -+static inline void ve_stop_idle(struct ve_struct *ve, -+ unsigned int cpu, cycles_t cycles) -+{ -+ struct ve_cpu_stats *ve_stat; -+ -+ ve_stat = VE_CPU_STATS(ve, cpu); -+ -+ write_seqcount_begin(&ve_stat->stat_lock); -+ if (ve_stat->strt_idle_time) { -+ if (cycles_after(cycles, ve_stat->strt_idle_time)) { -+ if (nr_uninterruptible_ve(ve) == 0) -+ ve_stat->idle_time += cycles - -+ ve_stat->strt_idle_time; -+ else -+ ve_stat->iowait_time += cycles - -+ ve_stat->strt_idle_time; -+ } -+ ve_stat->strt_idle_time = 0; -+ } -+ write_seqcount_end(&ve_stat->stat_lock); -+} -+ -+static inline void ve_strt_idle(struct ve_struct *ve, -+ unsigned int cpu, cycles_t cycles) -+{ -+ struct ve_cpu_stats *ve_stat; -+ -+ ve_stat = VE_CPU_STATS(ve, cpu); -+ -+ write_seqcount_begin(&ve_stat->stat_lock); -+ ve_stat->strt_idle_time = cycles; -+ write_seqcount_end(&ve_stat->stat_lock); -+} -+ -+#define ve_nr_running_inc(env, cpu, cycles) do { \ -+ if (++VE_CPU_STATS((env), 
(cpu))->nr_running == 1) \ -+ ve_stop_idle(env, cpu, cycles); \ -+ } while (0) -+#define ve_nr_running_dec(env, cpu, cyclses) do { \ -+ if (--VE_CPU_STATS((env), (cpu))->nr_running == 0) \ -+ ve_strt_idle(env, cpu, cycles); \ -+ } while (0) -+ -+void ve_sched_attach(struct ve_struct *envid) -+{ -+ struct task_struct *tsk; -+ unsigned int cpu; -+ cycles_t cycles; -+ -+ tsk = current; -+ preempt_disable(); -+ cycles = get_cycles(); -+ cpu = task_cpu(tsk); -+ ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles); -+ ve_nr_running_inc(envid, cpu, cycles); -+ preempt_enable(); -+} -+EXPORT_SYMBOL(ve_sched_attach); -+ -+static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc) -+{ -+ struct ve_task_info *ti; -+ -+ ti = VE_TASK_INFO(p); -+ write_seqcount_begin(&ti->wakeup_lock); -+ ti->wakeup_stamp = cyc; -+ write_seqcount_end(&ti->wakeup_lock); -+} -+ -+static inline void update_sched_lat(struct task_struct *t, cycles_t cycles) -+{ -+ int cpu; -+ cycles_t ve_wstamp; -+ -+ /* safe due to runqueue lock */ -+ cpu = smp_processor_id(); -+ ve_wstamp = t->ve_task_info.wakeup_stamp; -+ -+ if (ve_wstamp && cycles > ve_wstamp) { -+ KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, -+ cpu, cycles - ve_wstamp); -+ KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve, -+ cpu, cycles - ve_wstamp); -+ } -+} -+ -+static inline void update_ve_task_info(task_t *prev, cycles_t cycles) -+{ -+#ifdef CONFIG_FAIRSCHED -+ if (prev != this_pcpu()->idle) { -+#else -+ if (prev != this_rq()->idle) { -+#endif -+ VE_CPU_STATS(prev->ve_task_info.owner_env, -+ smp_processor_id())->used_time += -+ cycles - prev->ve_task_info.sched_time; -+ -+ prev->ve_task_info.sched_time = cycles; -+ } -+} -+ -+#else -+#define ve_nr_running_inc(env, cpu, cycles) do { } while(0) -+#define ve_nr_running_dec(env, cpu, cycles) do { } while(0) -+#define ve_nr_iowait_inc(env, cpu) do { } while(0) -+#define ve_nr_iowait_dec(env, cpu) do { } while(0) -+#define ve_nr_unint_inc(env, cpu) do { } while(0) 
-+#define ve_nr_unint_dec(env, cpu) do { } while(0) -+#define update_ve_task_info(prev, cycles) do { } while (0) -+#endif -+ -+unsigned long nr_zombie = 0; /* protected by tasklist_lock */ -+EXPORT_SYMBOL(nr_zombie); -+ -+atomic_t nr_dead = ATOMIC_INIT(0); -+EXPORT_SYMBOL(nr_dead); -+ - #ifdef CONFIG_SCHEDSTATS - /* - * bump this up when changing the output format or the meaning of an existing -@@ -666,8 +855,16 @@ static int effective_prio(task_t *p) - */ - static inline void __activate_task(task_t *p, runqueue_t *rq) - { -+ cycles_t cycles; -+ -+#ifdef CONFIG_VE -+ cycles = get_cycles(); -+ write_wakeup_stamp(p, cycles); -+ p->ve_task_info.sleep_time += cycles; -+#endif - enqueue_task(p, rq->active); - rq->nr_running++; -+ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles); - } - - /* -@@ -800,6 +997,25 @@ static void activate_task(task_t *p, run - */ - static void deactivate_task(struct task_struct *p, runqueue_t *rq) - { -+ cycles_t cycles; -+#ifdef CONFIG_VE -+ unsigned int cpu; -+ struct ve_struct *ve; -+ -+ cycles = get_cycles(); -+ cpu = task_cpu(p); -+ ve = p->ve_task_info.owner_env; -+ -+ p->ve_task_info.sleep_time -= cycles; -+#endif -+ if (p->state == TASK_UNINTERRUPTIBLE) -+ ve_nr_unint_inc(ve, cpu); -+ if (p->state == TASK_INTERRUPTIBLE) -+ rq->nr_sleeping++; -+ if (p->state == TASK_STOPPED) -+ rq->nr_stopped++; -+ -+ ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles); - rq->nr_running--; - dequeue_task(p, p->array); - p->array = NULL; -@@ -913,6 +1129,7 @@ repeat: - } - task_rq_unlock(rq, &flags); - } -+EXPORT_SYMBOL_GPL(wait_task_inactive); - - /*** - * kick_process - kick a running thread to enter/exit the kernel -@@ -1269,7 +1486,13 @@ out_set_cpu: - - out_activate: - #endif /* CONFIG_SMP */ -- if (old_state == TASK_UNINTERRUPTIBLE) { -+ if (old_state == TASK_INTERRUPTIBLE) -+ rq->nr_sleeping--; -+ else if (old_state == TASK_STOPPED) -+ rq->nr_stopped--; -+ else if (old_state == TASK_UNINTERRUPTIBLE) { -+ 
ve_nr_unint_dec(p->ve_task_info.owner_env, -+ smp_processor_id()); - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn -@@ -1369,6 +1592,10 @@ void fastcall sched_fork(task_t *p, int - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); -+#ifdef CONFIG_VE -+ /*cosmetic: sleep till wakeup below*/ -+ p->ve_task_info.sleep_time -= get_cycles(); -+#endif - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only -@@ -1426,6 +1653,8 @@ void fastcall wake_up_new_task(task_t *p - p->array = current->array; - p->array->nr_active++; - rq->nr_running++; -+ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, -+ task_cpu(p), get_cycles()); - } - set_need_resched(); - } else -@@ -1569,8 +1798,9 @@ asmlinkage void schedule_tail(task_t *pr - preempt_enable(); - #endif - if (current->set_child_tid) -- put_user(current->pid, current->set_child_tid); -+ put_user(virt_pid(current), current->set_child_tid); - } -+EXPORT_SYMBOL_GPL(schedule_tail); - - /* - * context_switch - switch to the new MM and the new -@@ -1617,6 +1847,7 @@ unsigned long nr_running(void) - - return sum; - } -+EXPORT_SYMBOL(nr_running); - - unsigned long nr_uninterruptible(void) - { -@@ -1635,6 +1866,8 @@ unsigned long nr_uninterruptible(void) - return sum; - } - -+EXPORT_SYMBOL(nr_uninterruptible); -+ - unsigned long long nr_context_switches(void) - { - unsigned long long i, sum = 0; -@@ -1645,6 +1878,8 @@ unsigned long long nr_context_switches(v - return sum; - } - -+EXPORT_SYMBOL(nr_context_switches); -+ - unsigned long nr_iowait(void) - { - unsigned long i, sum = 0; -@@ -1655,11 +1890,87 @@ unsigned long nr_iowait(void) - return sum; - } - -+EXPORT_SYMBOL(nr_iowait); -+ -+unsigned long nr_stopped(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_cpu(i) -+ sum += cpu_rq(i)->nr_stopped; -+ -+ return sum; -+} -+ -+EXPORT_SYMBOL(nr_stopped); -+ -+unsigned long nr_sleeping(void) -+{ -+ unsigned long i, sum = 0; -+ -+ 
for_each_cpu(i) -+ sum += cpu_rq(i)->nr_sleeping; -+ -+ return sum; -+} -+ -+EXPORT_SYMBOL(nr_sleeping); -+ -+#ifdef CONFIG_VE -+unsigned long nr_running_ve(struct ve_struct *ve) -+{ -+ int i; -+ long sum; -+ cpumask_t ve_cpus; -+ -+ sum = 0; -+ ve_cpu_online_map(ve, &ve_cpus); -+ for_each_cpu_mask(i, ve_cpus) -+ sum += VE_CPU_STATS(ve, i)->nr_running; -+ return (unsigned long)(sum < 0 ? 0 : sum); -+} -+ -+EXPORT_SYMBOL(nr_running_ve); -+ -+unsigned long nr_uninterruptible_ve(struct ve_struct *ve) -+{ -+ int i; -+ long sum; -+ cpumask_t ve_cpus; -+ -+ sum = 0; -+ ve_cpu_online_map(ve, &ve_cpus); -+ for_each_cpu_mask(i, ve_cpus) -+ sum += VE_CPU_STATS(ve, i)->nr_unint; -+ return (unsigned long)(sum < 0 ? 0 : sum); -+} -+ -+EXPORT_SYMBOL(nr_uninterruptible_ve); -+ -+unsigned long nr_iowait_ve(struct ve_struct *ve) -+{ -+ int i; -+ long sum; -+ cpumask_t ve_cpus; -+ -+ sum = 0; -+ ve_cpu_online_map(ve, &ve_cpus); -+ for_each_cpu_mask(i, ve_cpus) -+ sum += VE_CPU_STATS(ve, i)->nr_iowait; -+ return (unsigned long)(sum < 0 ? 0 : sum); -+} -+ -+EXPORT_SYMBOL(nr_iowait_ve); -+#endif -+ - #ifdef CONFIG_SMP - - /* - * double_rq_lock - safely lock two runqueues - * -+ * We must take them in cpu order to match code in -+ * dependent_sleeper and wake_dependent_sleeper. -+ * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. 
- */ -@@ -1671,7 +1982,7 @@ static void double_rq_lock(runqueue_t *r - spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { -- if (rq1 < rq2) { -+ if (rq1->cpu < rq2->cpu) { - spin_lock(&rq1->lock); - spin_lock(&rq2->lock); - } else { -@@ -1707,7 +2018,7 @@ static void double_lock_balance(runqueue - __acquires(this_rq->lock) - { - if (unlikely(!spin_trylock(&busiest->lock))) { -- if (busiest < this_rq) { -+ if (busiest->cpu < this_rq->cpu) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock(&this_rq->lock); -@@ -1769,10 +2080,18 @@ static - void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) - { -+ struct ve_struct *ve; -+ cycles_t cycles; -+ -+ cycles = get_cycles(); -+ ve = VE_TASK_INFO(p)->owner_env; -+ - dequeue_task(p, src_array); - src_rq->nr_running--; -+ ve_nr_running_dec(ve, task_cpu(p), cycles); - set_task_cpu(p, this_cpu); - this_rq->nr_running++; -+ ve_nr_running_inc(ve, task_cpu(p), cycles); - enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; -@@ -2476,6 +2795,15 @@ unsigned long long current_sched_time(co - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -+#ifdef CONFIG_VE -+#define update_ve_cpu_time(p, time, tick) do { \ -+ VE_CPU_STATS((p)->ve_task_info.owner_env, \ -+ task_cpu(p))->time += tick; \ -+ } while (0) -+#else -+#define update_ve_cpu_time(p, time, tick) do { } while (0) -+#endif -+ - /* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to -@@ -2491,10 +2819,13 @@ void account_user_time(struct task_struc - - /* Add user time to cpustat. 
*/ - tmp = cputime_to_cputime64(cputime); -- if (TASK_NICE(p) > 0) -+ if (TASK_NICE(p) > 0) { - cpustat->nice = cputime64_add(cpustat->nice, tmp); -- else -+ update_ve_cpu_time(p, nice, tmp); -+ } else { - cpustat->user = cputime64_add(cpustat->user, tmp); -+ update_ve_cpu_time(p, user, tmp); -+ } - } - - /* -@@ -2511,9 +2842,11 @@ void account_system_time(struct task_str - cputime64_t tmp; - - p->stime = cputime_add(p->stime, cputime); -+ tmp = cputime_to_cputime64(cputime); -+ -+ update_ve_cpu_time(p, system, tmp); - - /* Add system time to cpustat. */ -- tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) -@@ -3015,11 +3348,30 @@ switch_tasks: - - sched_info_switch(prev, next); - if (likely(prev != next)) { -+ cycles_t cycles; -+ -+ cycles = get_cycles(); - next->timestamp = now; - rq->nr_switches++; - rq->curr = next; - ++*switch_count; - -+#ifdef CONFIG_VE -+ prev->ve_task_info.sleep_stamp = cycles; -+ if (prev->state == TASK_RUNNING && prev != this_rq()->idle) -+ write_wakeup_stamp(prev, cycles); -+ update_sched_lat(next, cycles); -+ -+ /* because next & prev are protected with -+ * runqueue lock we may not worry about -+ * wakeup_stamp and sched_time protection -+ * (same thing in 'else' branch below) -+ */ -+ update_ve_task_info(prev, cycles); -+ next->ve_task_info.sched_time = cycles; -+ write_wakeup_stamp(next, 0); -+#endif -+ - prepare_task_switch(rq, next); - prev = context_switch(rq, prev, next); - barrier(); -@@ -3029,8 +3381,10 @@ switch_tasks: - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); -- } else -+ } else { -+ update_ve_task_info(prev, get_cycles()); - spin_unlock_irq(&rq->lock); -+ } - - prev = current; - if (unlikely(reacquire_kernel_lock(prev) < 0)) -@@ -3593,7 +3947,7 @@ task_t *idle_task(int cpu) - */ - static inline task_t *find_process_by_pid(pid_t pid) - { -- return pid ? 
find_task_by_pid(pid) : current; -+ return pid ? find_task_by_pid_ve(pid) : current; - } - - /* Actually do priority change: must hold rq lock. */ -@@ -3653,7 +4007,7 @@ recheck: - /* - * Allow unprivileged RT tasks to decrease priority: - */ -- if (!capable(CAP_SYS_NICE)) { -+ if (!capable(CAP_SYS_ADMIN)) { - /* - * can't change policy, except between SCHED_NORMAL - * and SCHED_BATCH: -@@ -4112,8 +4466,15 @@ void __sched io_schedule(void) - { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); - -+#ifdef CONFIG_VE -+ struct ve_struct *ve; -+ ve = current->ve_task_info.owner_env; -+#endif -+ - atomic_inc(&rq->nr_iowait); -+ ve_nr_iowait_inc(ve, smp_processor_id()); - schedule(); -+ ve_nr_iowait_dec(ve, smp_processor_id()); - atomic_dec(&rq->nr_iowait); - } - -@@ -4124,8 +4485,15 @@ long __sched io_schedule_timeout(long ti - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); - long ret; - -+#ifdef CONFIG_VE -+ struct ve_struct *ve; -+ ve = current->ve_task_info.owner_env; -+#endif -+ - atomic_inc(&rq->nr_iowait); -+ ve_nr_iowait_inc(ve, smp_processor_id()); - ret = schedule_timeout(timeout); -+ ve_nr_iowait_dec(ve, smp_processor_id()); - atomic_dec(&rq->nr_iowait); - return ret; - } -@@ -4248,15 +4616,9 @@ static void show_task(task_t *p) - else - printk("?"); - #if (BITS_PER_LONG == 32) -- if (state == TASK_RUNNING) -- printk(" running "); -- else -- printk(" %08lX ", thread_saved_pc(p)); -+ printk(" %08lX ", (unsigned long)p); - #else -- if (state == TASK_RUNNING) -- printk(" running task "); -- else -- printk(" %016lx ", thread_saved_pc(p)); -+ printk(" %016lx ", (unsigned long)p); - #endif - #ifdef CONFIG_DEBUG_STACK_USAGE - { -@@ -4295,21 +4657,21 @@ void show_state(void) - #if (BITS_PER_LONG == 32) - printk("\n" - " sibling\n"); -- printk(" task PC pid father child younger older\n"); -+ printk(" task taskaddr pid father child younger older\n"); - #else - printk("\n" - " sibling\n"); -- printk(" task PC pid father child younger 
older\n"); -+ printk(" task taskaddr pid father child younger older\n"); - #endif - read_lock(&tasklist_lock); -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: - */ - touch_nmi_watchdog(); - show_task(p); -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - - read_unlock(&tasklist_lock); - mutex_debug_show_all_locks(); -@@ -4590,13 +4952,13 @@ static void migrate_live_tasks(int src_c - - write_lock_irq(&tasklist_lock); - -- do_each_thread(t, tsk) { -+ do_each_thread_all(t, tsk) { - if (tsk == current) - continue; - - if (task_cpu(tsk) == src_cpu) - move_task_off_dead_cpu(src_cpu, tsk); -- } while_each_thread(t, tsk); -+ } while_each_thread_all(t, tsk); - - write_unlock_irq(&tasklist_lock); - } -@@ -6035,6 +6397,7 @@ void __init sched_init(void) - rq->push_cpu = 0; - rq->migration_thread = NULL; - INIT_LIST_HEAD(&rq->migration_queue); -+ rq->cpu = i; - #endif - atomic_set(&rq->nr_iowait, 0); - -@@ -6095,7 +6458,7 @@ void normalize_rt_tasks(void) - runqueue_t *rq; - - read_lock_irq(&tasklist_lock); -- for_each_process (p) { -+ for_each_process_all (p) { - if (!rt_task(p)) - continue; - -diff -upr linux-2.6.16.orig/kernel/signal.c linux-2.6.16-026test009/kernel/signal.c ---- linux-2.6.16.orig/kernel/signal.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/signal.c 2006-04-19 15:02:12.000000000 +0400 -@@ -25,17 +25,20 @@ - #include <linux/posix-timers.h> - #include <linux/signal.h> - #include <linux/audit.h> -+#include <linux/kmem_cache.h> - #include <linux/capability.h> - #include <asm/param.h> - #include <asm/uaccess.h> - #include <asm/unistd.h> - #include <asm/siginfo.h> -+#include <ub/ub_misc.h> - - /* - * SLAB caches for signal bits. 
- */ - --static kmem_cache_t *sigqueue_cachep; -+kmem_cache_t *sigqueue_cachep; -+EXPORT_SYMBOL_GPL(sigqueue_cachep); - - /* - * In POSIX a signal is sent either to a specific thread (Linux task) -@@ -221,6 +224,7 @@ fastcall void recalc_sigpending_tsk(stru - else - clear_tsk_thread_flag(t, TIF_SIGPENDING); - } -+EXPORT_SYMBOL_GPL(recalc_sigpending_tsk); - - void recalc_sigpending(void) - { -@@ -271,8 +275,13 @@ static struct sigqueue *__sigqueue_alloc - atomic_inc(&t->user->sigpending); - if (override_rlimit || - atomic_read(&t->user->sigpending) <= -- t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) -+ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { - q = kmem_cache_alloc(sigqueue_cachep, flags); -+ if (q && ub_siginfo_charge(q, get_task_ub(t))) { -+ kmem_cache_free(sigqueue_cachep, q); -+ q = NULL; -+ } -+ } - if (unlikely(q == NULL)) { - atomic_dec(&t->user->sigpending); - } else { -@@ -289,6 +298,7 @@ static void __sigqueue_free(struct sigqu - return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); -+ ub_siginfo_uncharge(q); - kmem_cache_free(sigqueue_cachep, q); - } - -@@ -524,7 +534,16 @@ static int __dequeue_signal(struct sigpe - { - int sig = 0; - -- sig = next_signal(pending, mask); -+ /* SIGKILL must have priority, otherwise it is quite easy -+ * to create an unkillable process, sending sig < SIGKILL -+ * to self */ -+ if (unlikely(sigismember(&pending->signal, SIGKILL))) { -+ if (!sigismember(mask, SIGKILL)) -+ sig = SIGKILL; -+ } -+ -+ if (likely(!sig)) -+ sig = next_signal(pending, mask); - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { -@@ -618,6 +637,7 @@ void signal_wake_up(struct task_struct * - if (!wake_up_state(t, mask)) - kick_process(t); - } -+EXPORT_SYMBOL_GPL(signal_wake_up); - - /* - * Remove signals in mask from the pending set and queue. 
-@@ -838,7 +858,7 @@ static int send_signal(int sig, struct s - q->info.si_signo = sig; - q->info.si_errno = 0; - q->info.si_code = SI_USER; -- q->info.si_pid = current->pid; -+ q->info.si_pid = virt_pid(current); - q->info.si_uid = current->uid; - break; - case (unsigned long) SEND_SIG_PRIV: -@@ -975,7 +995,6 @@ __group_complete_signal(int sig, struct - if (t == NULL) - /* restart balancing at this thread */ - t = p->signal->curr_target = p; -- BUG_ON(t->tgid != p->tgid); - - while (!wants_signal(sig, t)) { - t = next_thread(t); -@@ -1159,13 +1178,18 @@ int __kill_pg_info(int sig, struct sigin - if (pgrp <= 0) - return -EINVAL; - -+ /* Use __vpid_to_pid(). This function is used under write_lock -+ * tasklist_lock. */ -+ if (is_virtual_pid(pgrp)) -+ pgrp = __vpid_to_pid(pgrp); -+ - success = 0; - retval = -ESRCH; -- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { -+ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { - int err = group_send_sig_info(sig, info, p); - success |= !err; - retval = err; -- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); -+ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); - return success ? 
0 : retval; - } - -@@ -1193,7 +1217,7 @@ kill_proc_info(int sig, struct siginfo * - read_lock(&tasklist_lock); - acquired_tasklist_lock = 1; - } -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - error = -ESRCH; - if (p) - error = group_send_sig_info(sig, info, p); -@@ -1214,7 +1238,7 @@ int kill_proc_info_as_uid(int sig, struc - return ret; - - read_lock(&tasklist_lock); -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - if (!p) { - ret = -ESRCH; - goto out_unlock; -@@ -1253,8 +1277,8 @@ static int kill_something_info(int sig, - struct task_struct * p; - - read_lock(&tasklist_lock); -- for_each_process(p) { -- if (p->pid > 1 && p->tgid != current->tgid) { -+ for_each_process_ve(p) { -+ if (virt_pid(p) > 1 && p->tgid != current->tgid) { - int err = group_send_sig_info(sig, info, p); - ++count; - if (err != -EPERM) -@@ -1562,9 +1586,17 @@ void do_notify_parent(struct task_struct - BUG_ON(!tsk->ptrace && - (tsk->group_leader != tsk || !thread_group_empty(tsk))); - -+#ifdef CONFIG_VE -+ /* Allow to send only SIGCHLD from VE */ -+ if (sig != SIGCHLD && -+ tsk->ve_task_info.owner_env != -+ tsk->parent->ve_task_info.owner_env) -+ sig = SIGCHLD; -+#endif -+ - info.si_signo = sig; - info.si_errno = 0; -- info.si_pid = tsk->pid; -+ info.si_pid = get_task_pid_ve(tsk, tsk->parent->ve_task_info.owner_env); - info.si_uid = tsk->uid; - - /* FIXME: find out whether or not this is supposed to be c*time. */ -@@ -1629,7 +1661,7 @@ static void do_notify_parent_cldstop(str - - info.si_signo = SIGCHLD; - info.si_errno = 0; -- info.si_pid = tsk->pid; -+ info.si_pid = get_task_pid_ve(tsk, VE_TASK_INFO(parent)->owner_env); - info.si_uid = tsk->uid; - - /* FIXME: find out whether or not this is supposed to be c*time. */ -@@ -1763,7 +1795,9 @@ finish_stop(int stop_count) - read_unlock(&tasklist_lock); - - out: -+ set_stop_state(current); - schedule(); -+ clear_stop_state(current); - /* - * Now we don't run again until continued. 
- */ -@@ -1940,11 +1974,13 @@ relock: - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ -+ set_pn_state(current, PN_STOP_SIGNAL); - ptrace_stop(signr, signr, info); -+ clear_pn_state(current); - -- /* We're back. Did the debugger cancel the sig or group_exit? */ -+ /* We're back. Did the debugger cancel the sig? */ - signr = current->exit_code; -- if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) -+ if (signr == 0) - continue; - - current->exit_code = 0; -@@ -1957,7 +1993,7 @@ relock: - info->si_signo = signr; - info->si_errno = 0; - info->si_code = SI_USER; -- info->si_pid = current->parent->pid; -+ info->si_pid = virt_pid(current->parent); - info->si_uid = current->parent->uid; - } - -@@ -1988,8 +2024,14 @@ relock: - continue; - - /* Init gets no signals it doesn't want. */ -- if (current->pid == 1) -+ if (virt_pid(current) == 1) { -+ /* Allow SIGKILL for non-root VE */ -+#ifdef CONFIG_VE -+ if (current->pid == 1 || -+ signr != SIGKILL) -+#endif - continue; -+ } - - if (sig_kernel_stop(signr)) { - /* -@@ -2307,7 +2349,6 @@ sys_rt_sigtimedwait(const sigset_t __use - - timeout = schedule_timeout_interruptible(timeout); - -- try_to_freeze(); - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - current->blocked = current->real_blocked; -@@ -2340,7 +2381,7 @@ sys_kill(int pid, int sig) - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; -- info.si_pid = current->tgid; -+ info.si_pid = virt_tgid(current); - info.si_uid = current->uid; - - return kill_something_info(sig, &info, pid); -@@ -2356,12 +2397,12 @@ static int do_tkill(int tgid, int pid, i - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; -- info.si_pid = current->tgid; -+ info.si_pid = virt_tgid(current); - info.si_uid = current->uid; - - read_lock(&tasklist_lock); -- p = find_task_by_pid(pid); -- if (p && (tgid <= 0 || p->tgid == tgid)) { -+ p = find_task_by_pid_ve(pid); -+ if (p && (tgid <= 0 || 
virt_tgid(p) == tgid)) { - error = check_kill_permission(sig, &info, p); - /* - * The null signal is a permissions and process existence -diff -upr linux-2.6.16.orig/kernel/softirq.c linux-2.6.16-026test009/kernel/softirq.c ---- linux-2.6.16.orig/kernel/softirq.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/softirq.c 2006-04-19 15:02:12.000000000 +0400 -@@ -17,6 +17,8 @@ - #include <linux/kthread.h> - #include <linux/rcupdate.h> - -+#include <ub/beancounter.h> -+ - #include <asm/irq.h> - /* - - No shared variables, all the data are CPU local. -@@ -73,10 +75,14 @@ static inline void wakeup_softirqd(void) - - asmlinkage void __do_softirq(void) - { -+ struct user_beancounter *ub; - struct softirq_action *h; - __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; - int cpu; -+ struct ve_struct *envid; -+ -+ envid = set_exec_env(get_ve0()); - - pending = local_softirq_pending(); - -@@ -90,6 +96,7 @@ restart: - - h = softirq_vec; - -+ ub = set_exec_ub(get_ub0()); - do { - if (pending & 1) { - h->action(h); -@@ -98,6 +105,7 @@ restart: - h++; - pending >>= 1; - } while (pending); -+ (void)set_exec_ub(ub); - - local_irq_disable(); - -@@ -108,6 +116,7 @@ restart: - if (pending) - wakeup_softirqd(); - -+ (void)set_exec_env(envid); - __local_bh_enable(); - } - -diff -upr linux-2.6.16.orig/kernel/sys.c linux-2.6.16-026test009/kernel/sys.c ---- linux-2.6.16.orig/kernel/sys.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/sys.c 2006-04-19 15:02:12.000000000 +0400 -@@ -11,6 +11,7 @@ - #include <linux/mman.h> - #include <linux/smp_lock.h> - #include <linux/notifier.h> -+#include <linux/virtinfo.h> - #include <linux/reboot.h> - #include <linux/prctl.h> - #include <linux/init.h> -@@ -236,6 +237,94 @@ int capable(int cap) - EXPORT_SYMBOL(capable); - #endif - -+static DECLARE_MUTEX(virtinfo_sem); -+static struct vnotifier_block *virtinfo_chain[VIRT_TYPES]; -+ -+void virtinfo_notifier_register(int type, struct vnotifier_block *nb) 
-+{ -+ struct vnotifier_block **p; -+ -+ down(&virtinfo_sem); -+ for (p = &virtinfo_chain[type]; -+ *p != NULL && nb->priority < (*p)->priority; -+ p = &(*p)->next); -+ nb->next = *p; -+ smp_wmb(); -+ *p = nb; -+ up(&virtinfo_sem); -+} -+ -+EXPORT_SYMBOL(virtinfo_notifier_register); -+ -+struct virtinfo_cnt_struct { -+ volatile unsigned long exit[NR_CPUS]; -+ volatile unsigned long entry; -+}; -+static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt); -+ -+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb) -+{ -+ struct vnotifier_block **p; -+ int entry_cpu, exit_cpu; -+ unsigned long cnt, ent; -+ -+ down(&virtinfo_sem); -+ for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next); -+ *p = nb->next; -+ smp_mb(); -+ -+ for_each_cpu_mask(entry_cpu, cpu_possible_map) { -+ while (1) { -+ cnt = 0; -+ for_each_cpu_mask(exit_cpu, cpu_possible_map) -+ cnt += -+ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]; -+ smp_rmb(); -+ ent = per_cpu(virtcnt, entry_cpu).entry; -+ if (cnt == ent) -+ break; -+ __set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_timeout(HZ / 100); -+ } -+ } -+ up(&virtinfo_sem); -+} -+ -+EXPORT_SYMBOL(virtinfo_notifier_unregister); -+ -+int virtinfo_notifier_call(int type, unsigned long n, void *data) -+{ -+ int ret; -+ int entry_cpu, exit_cpu; -+ struct vnotifier_block *nb; -+ -+ entry_cpu = get_cpu(); -+ per_cpu(virtcnt, entry_cpu).entry++; -+ smp_wmb(); -+ put_cpu(); -+ -+ nb = virtinfo_chain[type]; -+ ret = NOTIFY_DONE; -+ while (nb) -+ { -+ ret = nb->notifier_call(nb, n, data, ret); -+ if(ret & NOTIFY_STOP_MASK) { -+ ret &= ~NOTIFY_STOP_MASK; -+ break; -+ } -+ nb = nb->next; -+ } -+ -+ exit_cpu = get_cpu(); -+ smp_wmb(); -+ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++; -+ put_cpu(); -+ -+ return ret; -+} -+ -+EXPORT_SYMBOL(virtinfo_notifier_call); -+ - static int set_one_prio(struct task_struct *p, int niceval, int error) - { - int no_nice; -@@ -281,17 +370,19 @@ asmlinkage long sys_setpriority(int whic - switch (which) { - 
case PRIO_PROCESS: - if (!who) -- who = current->pid; -- p = find_task_by_pid(who); -+ who = virt_pid(current); -+ p = find_task_by_pid_ve(who); - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (!who) - who = process_group(current); -- do_each_task_pid(who, PIDTYPE_PGID, p) { -+ else -+ who = vpid_to_pid(who); -+ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); -- } while_each_task_pid(who, PIDTYPE_PGID, p); -+ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); - break; - case PRIO_USER: - user = current->user; -@@ -301,10 +392,10 @@ asmlinkage long sys_setpriority(int whic - if ((who != current->uid) && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - -- do_each_thread(g, p) -+ do_each_thread_ve(g, p) - if (p->uid == who) - error = set_one_prio(p, niceval, error); -- while_each_thread(g, p); -+ while_each_thread_ve(g, p); - if (who != current->uid) - free_uid(user); /* For find_user() */ - break; -@@ -334,8 +425,8 @@ asmlinkage long sys_getpriority(int whic - switch (which) { - case PRIO_PROCESS: - if (!who) -- who = current->pid; -- p = find_task_by_pid(who); -+ who = virt_pid(current); -+ p = find_task_by_pid_ve(who); - if (p) { - niceval = 20 - task_nice(p); - if (niceval > retval) -@@ -345,11 +436,13 @@ asmlinkage long sys_getpriority(int whic - case PRIO_PGRP: - if (!who) - who = process_group(current); -- do_each_task_pid(who, PIDTYPE_PGID, p) { -+ else -+ who = vpid_to_pid(who); -+ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; -- } while_each_task_pid(who, PIDTYPE_PGID, p); -+ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); - break; - case PRIO_USER: - user = current->user; -@@ -359,13 +452,13 @@ asmlinkage long sys_getpriority(int whic - if ((who != current->uid) && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - -- do_each_thread(g, p) -+ 
do_each_thread_ve(g, p) - if (p->uid == who) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } -- while_each_thread(g, p); -+ while_each_thread_ve(g, p); - if (who != current->uid) - free_uid(user); /* for find_user() */ - break; -@@ -497,6 +590,35 @@ asmlinkage long sys_reboot(int magic1, i - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - -+#ifdef CONFIG_VE -+ if (!ve_is_super(get_exec_env())) -+ switch (cmd) { -+ case LINUX_REBOOT_CMD_RESTART: -+ case LINUX_REBOOT_CMD_HALT: -+ case LINUX_REBOOT_CMD_POWER_OFF: -+ case LINUX_REBOOT_CMD_RESTART2: { -+ struct siginfo info; -+ -+ info.si_errno = 0; -+ info.si_code = SI_KERNEL; -+ info.si_pid = virt_pid(current); -+ info.si_uid = current->uid; -+ info.si_signo = SIGKILL; -+ -+ /* Sending to real init is safe */ -+ send_sig_info(SIGKILL, &info, -+ get_exec_env()->init_entry); -+ } -+ -+ case LINUX_REBOOT_CMD_CAD_ON: -+ case LINUX_REBOOT_CMD_CAD_OFF: -+ return 0; -+ -+ default: -+ return -EINVAL; -+ } -+#endif -+ - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. 
- */ -@@ -686,7 +808,7 @@ asmlinkage long sys_setgid(gid_t gid) - return 0; - } - --static int set_user(uid_t new_ruid, int dumpclear) -+int set_user(uid_t new_ruid, int dumpclear) - { - struct user_struct *new_user; - -@@ -711,6 +833,7 @@ static int set_user(uid_t new_ruid, int - current->uid = new_ruid; - return 0; - } -+EXPORT_SYMBOL(set_user); - - /* - * Unprivileged users may change the real uid to the effective uid -@@ -1079,7 +1202,12 @@ asmlinkage long sys_times(struct tms __u - if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) - return -EFAULT; - } -+#ifndef CONFIG_VE - return (long) jiffies_64_to_clock_t(get_jiffies_64()); -+#else -+ return (long) jiffies_64_to_clock_t(get_jiffies_64() - -+ get_exec_env()->start_jiffies); -+#endif - } - - /* -@@ -1100,21 +1228,24 @@ asmlinkage long sys_setpgid(pid_t pid, p - struct task_struct *p; - struct task_struct *group_leader = current->group_leader; - int err = -EINVAL; -+ int _pgid; - - if (!pid) -- pid = group_leader->pid; -+ pid = virt_pid(group_leader); - if (!pgid) - pgid = pid; - if (pgid < 0) - return -EINVAL; - -+ _pgid = vpid_to_pid(pgid); -+ - /* From this point forward we keep holding onto the tasklist lock - * so that our parent does not change from under us. 
-DaveM - */ - write_lock_irq(&tasklist_lock); - - err = -ESRCH; -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - if (!p) - goto out; - -@@ -1139,25 +1270,35 @@ asmlinkage long sys_setpgid(pid_t pid, p - if (p->signal->leader) - goto out; - -- if (pgid != pid) { -+ pgid = virt_pid(p); -+ if (_pgid != p->pid) { - struct task_struct *p; - -- do_each_task_pid(pgid, PIDTYPE_PGID, p) { -- if (p->signal->session == group_leader->signal->session) -+ do_each_task_pid_ve(_pgid, PIDTYPE_PGID, p) { -+ if (p->signal->session == group_leader->signal->session) { -+ pgid = virt_pgid(p); - goto ok_pgid; -- } while_each_task_pid(pgid, PIDTYPE_PGID, p); -+ } -+ } while_each_task_pid_ve(_pgid, PIDTYPE_PGID, p); - goto out; - } - - ok_pgid: -- err = security_task_setpgid(p, pgid); -+ err = security_task_setpgid(p, _pgid); - if (err) - goto out; - - if (process_group(p) != pgid) { - detach_pid(p, PIDTYPE_PGID); -- p->signal->pgrp = pgid; -- attach_pid(p, PIDTYPE_PGID, pgid); -+ p->signal->pgrp = _pgid; -+ set_virt_pgid(p, pgid); -+ attach_pid(p, PIDTYPE_PGID, _pgid); -+ if (atomic_read(&p->signal->count) != 1) { -+ task_t *t; -+ for (t = next_thread(p); t != p; t = next_thread(t)) { -+ set_virt_pgid(t, pgid); -+ } -+ } - } - - err = 0; -@@ -1170,19 +1311,19 @@ out: - asmlinkage long sys_getpgid(pid_t pid) - { - if (!pid) { -- return process_group(current); -+ return virt_pgid(current); - } else { - int retval; - struct task_struct *p; - - read_lock(&tasklist_lock); -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - - retval = -ESRCH; - if (p) { - retval = security_task_getpgid(p); - if (!retval) -- retval = process_group(p); -+ retval = virt_pgid(p); - } - read_unlock(&tasklist_lock); - return retval; -@@ -1194,7 +1335,7 @@ asmlinkage long sys_getpgid(pid_t pid) - asmlinkage long sys_getpgrp(void) - { - /* SMP - assuming writes are word atomic this is fine */ -- return process_group(current); -+ return virt_pgid(current); - } - - #endif -@@ -1202,19 
+1343,19 @@ asmlinkage long sys_getpgrp(void) - asmlinkage long sys_getsid(pid_t pid) - { - if (!pid) { -- return current->signal->session; -+ return virt_sid(current); - } else { - int retval; - struct task_struct *p; - - read_lock(&tasklist_lock); -- p = find_task_by_pid(pid); -+ p = find_task_by_pid_ve(pid); - - retval = -ESRCH; - if(p) { - retval = security_task_getsid(p); - if (!retval) -- retval = p->signal->session; -+ retval = virt_sid(p); - } - read_unlock(&tasklist_lock); - return retval; -@@ -1236,9 +1377,20 @@ asmlinkage long sys_setsid(void) - - group_leader->signal->leader = 1; - __set_special_pids(group_leader->pid, group_leader->pid); -+ set_virt_pgid(group_leader, virt_pid(group_leader)); -+ set_virt_sid(group_leader, virt_pid(group_leader)); - group_leader->signal->tty = NULL; - group_leader->signal->tty_old_pgrp = 0; -- err = process_group(group_leader); -+ if (atomic_read(&group_leader->signal->count) != 1) { -+ task_t *t; -+ for (t = next_thread(group_leader); t != group_leader; -+ t = next_thread(t)) { -+ set_virt_pgid(t, virt_pid(group_leader)); -+ set_virt_sid(t, virt_pid(group_leader)); -+ } -+ } -+ -+ err = virt_pgid(group_leader); - out: - write_unlock_irq(&tasklist_lock); - up(&tty_sem); -@@ -1518,7 +1670,7 @@ asmlinkage long sys_newuname(struct new_ - int errno = 0; - - down_read(&uts_sem); -- if (copy_to_user(name,&system_utsname,sizeof *name)) -+ if (copy_to_user(name,&ve_utsname,sizeof *name)) - errno = -EFAULT; - up_read(&uts_sem); - return errno; -@@ -1529,15 +1681,15 @@ asmlinkage long sys_sethostname(char __u - int errno; - char tmp[__NEW_UTS_LEN]; - -- if (!capable(CAP_SYS_ADMIN)) -+ if (!capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - if (len < 0 || len > __NEW_UTS_LEN) - return -EINVAL; - down_write(&uts_sem); - errno = -EFAULT; - if (!copy_from_user(tmp, name, len)) { -- memcpy(system_utsname.nodename, tmp, len); -- system_utsname.nodename[len] = 0; -+ memcpy(ve_utsname.nodename, tmp, len); -+ ve_utsname.nodename[len] = 0; - 
errno = 0; - } - up_write(&uts_sem); -@@ -1553,11 +1705,11 @@ asmlinkage long sys_gethostname(char __u - if (len < 0) - return -EINVAL; - down_read(&uts_sem); -- i = 1 + strlen(system_utsname.nodename); -+ i = 1 + strlen(ve_utsname.nodename); - if (i > len) - i = len; - errno = 0; -- if (copy_to_user(name, system_utsname.nodename, i)) -+ if (copy_to_user(name, ve_utsname.nodename, i)) - errno = -EFAULT; - up_read(&uts_sem); - return errno; -@@ -1574,7 +1726,7 @@ asmlinkage long sys_setdomainname(char _ - int errno; - char tmp[__NEW_UTS_LEN]; - -- if (!capable(CAP_SYS_ADMIN)) -+ if (!capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - if (len < 0 || len > __NEW_UTS_LEN) - return -EINVAL; -@@ -1582,8 +1734,8 @@ asmlinkage long sys_setdomainname(char _ - down_write(&uts_sem); - errno = -EFAULT; - if (!copy_from_user(tmp, name, len)) { -- memcpy(system_utsname.domainname, tmp, len); -- system_utsname.domainname[len] = 0; -+ memcpy(ve_utsname.domainname, tmp, len); -+ ve_utsname.domainname[len] = 0; - errno = 0; - } - up_write(&uts_sem); -@@ -1657,7 +1809,19 @@ asmlinkage long sys_setrlimit(unsigned i - (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - new_rlim.rlim_cur <= cputime_to_secs( - current->signal->it_prof_expires))) { -- cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); -+ unsigned long rlim_cur = new_rlim.rlim_cur; -+ cputime_t cputime; -+ -+ if (rlim_cur == 0) { -+ /* -+ * The caller is asking for an immediate RLIMIT_CPU -+ * expiry. But we use the zero value to mean "it was -+ * never set". 
So let's cheat and make it one second -+ * instead -+ */ -+ rlim_cur = 1; -+ } -+ cputime = secs_to_cputime(rlim_cur); - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, -diff -upr linux-2.6.16.orig/kernel/sysctl.c linux-2.6.16-026test009/kernel/sysctl.c ---- linux-2.6.16.orig/kernel/sysctl.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/sysctl.c 2006-04-19 15:02:12.000000000 +0400 -@@ -25,6 +25,8 @@ - #include <linux/slab.h> - #include <linux/sysctl.h> - #include <linux/proc_fs.h> -+#include <linux/ve_owner.h> -+#include <linux/ve.h> - #include <linux/capability.h> - #include <linux/ctype.h> - #include <linux/utsname.h> -@@ -63,6 +65,7 @@ extern int max_threads; - extern int sysrq_enabled; - extern int core_uses_pid; - extern int suid_dumpable; -+extern int sysctl_at_vsyscall; - extern char core_pattern[]; - extern int cad_pid; - extern int pid_max; -@@ -72,6 +75,12 @@ extern int printk_ratelimit_burst; - extern int pid_max_min, pid_max_max; - extern int sysctl_drop_caches; - extern int percpu_pagelist_fraction; -+#ifdef CONFIG_VE -+int glob_virt_pids = 1; -+EXPORT_SYMBOL(glob_virt_pids); -+#endif -+ -+extern int ve_area_access_check; /* fs/namei.c */ - - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - int unknown_nmi_panic; -@@ -122,6 +131,7 @@ extern int spin_retry; - #endif - - extern int sysctl_hz_timer; -+int decode_call_traces; - - #ifdef CONFIG_BSD_PROCESS_ACCT - extern int acct_parm[]; -@@ -133,8 +143,6 @@ extern int no_unaligned_warning; - - static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, - ctl_table *, void **); --static int proc_doutsstring(ctl_table *table, int write, struct file *filp, -- void __user *buffer, size_t *lenp, loff_t *ppos); - - static ctl_table root_table[]; - static struct ctl_table_header root_table_header = -@@ -178,6 +186,8 @@ static void register_proc_table(ctl_tabl - static void 
unregister_proc_table(ctl_table *, struct proc_dir_entry *); - #endif - -+extern struct new_utsname virt_utsname; -+ - /* The default sysctl tables: */ - - static ctl_table root_table[] = { -@@ -276,6 +286,15 @@ static ctl_table kern_table[] = { - .strategy = &sysctl_string, - }, - { -+ .ctl_name = KERN_VIRT_OSRELEASE, -+ .procname = "virt_osrelease", -+ .data = virt_utsname.release, -+ .maxlen = sizeof(virt_utsname.release), -+ .mode = 0644, -+ .proc_handler = &proc_doutsstring, -+ .strategy = &sysctl_string, -+ }, -+ { - .ctl_name = KERN_PANIC, - .procname = "panic", - .data = &panic_timeout, -@@ -590,6 +609,16 @@ static ctl_table kern_table[] = { - .extra1 = &pid_max_min, - .extra2 = &pid_max_max, - }, -+#ifdef CONFIG_VE -+ { -+ .ctl_name = KERN_VIRT_PIDS, -+ .procname = "virt_pids", -+ .data = &glob_virt_pids, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif - { - .ctl_name = KERN_PANIC_ON_OOPS, - .procname = "panic_on_oops", -@@ -1046,10 +1075,26 @@ static ctl_table fs_table[] = { - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -+ { -+ .ctl_name = FS_AT_VSYSCALL, -+ .procname = "vsyscall", -+ .data = &sysctl_at_vsyscall, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec -+ }, - { .ctl_name = 0 } - }; - - static ctl_table debug_table[] = { -+ { -+ .ctl_name = DBG_DECODE_CALLTRACES, -+ .procname = "decode_call_traces", -+ .data = &decode_call_traces, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec -+ }, - { .ctl_name = 0 } - }; - -@@ -1113,6 +1158,7 @@ int do_sysctl(int __user *name, int nlen - { - struct list_head *tmp; - int error = -ENOTDIR; -+ struct ve_struct *ve; - - if (nlen <= 0 || nlen >= CTL_MAXNAME) - return -ENOTDIR; -@@ -1121,13 +1167,24 @@ int do_sysctl(int __user *name, int nlen - if (!oldlenp || get_user(old_len, oldlenp)) - return -EFAULT; - } -+ ve = get_exec_env(); - spin_lock(&sysctl_lock); -+#ifdef CONFIG_VE -+ tmp = ve->sysctl_lh.next; 
-+#else - tmp = &root_table_header.ctl_entry; -+#endif - do { -- struct ctl_table_header *head = -- list_entry(tmp, struct ctl_table_header, ctl_entry); -+ struct ctl_table_header *head; - void *context = NULL; - -+#ifdef CONFIG_VE -+ if (tmp == &ve->sysctl_lh) -+ /* second pass over global variables */ -+ tmp = &root_table_header.ctl_entry; -+#endif -+ -+ head = list_entry(tmp, struct ctl_table_header, ctl_entry); - if (!use_table(head)) - continue; - -@@ -1181,10 +1238,14 @@ static int test_perm(int mode, int op) - static inline int ctl_perm(ctl_table *table, int op) - { - int error; -+ int mode = table->mode; -+ - error = security_sysctl(table, op); - if (error) - return error; -- return test_perm(table->mode, op); -+ if (!ve_accessible(table->owner_env, get_exec_env())) -+ mode &= ~0222; /* disable write access */ -+ return test_perm(mode, op); - } - - static int parse_table(int __user *name, int nlen, -@@ -1350,6 +1411,8 @@ struct ctl_table_header *register_sysctl - int insert_at_head) - { - struct ctl_table_header *tmp; -+ struct list_head *lh; -+ - tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); - if (!tmp) - return NULL; -@@ -1358,17 +1421,52 @@ struct ctl_table_header *register_sysctl - tmp->used = 0; - tmp->unregistering = NULL; - spin_lock(&sysctl_lock); -+#ifdef CONFIG_VE -+ lh = &get_exec_env()->sysctl_lh; -+#else -+ lh = &root_table_header.ctl_entry; -+#endif - if (insert_at_head) -- list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); -+ list_add(&tmp->ctl_entry, lh); - else -- list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); -+ list_add_tail(&tmp->ctl_entry, lh); - spin_unlock(&sysctl_lock); - #ifdef CONFIG_PROC_FS -+#ifdef CONFIG_VE -+ register_proc_table(table, get_exec_env()->proc_sys_root, tmp); -+#else - register_proc_table(table, proc_sys_root, tmp); - #endif -+#endif - return tmp; - } - -+void free_sysctl_clone(ctl_table *clone) -+{ -+ kfree(clone); -+} -+ -+ctl_table *clone_sysctl_template(ctl_table *tmpl, int 
nr) -+{ -+ int i; -+ ctl_table *clone; -+ -+ clone = kmalloc(nr * sizeof(ctl_table), GFP_KERNEL); -+ if (clone == NULL) -+ return NULL; -+ -+ memcpy(clone, tmpl, nr * sizeof(ctl_table)); -+ for (i = 0; i < nr; i++) { -+ if (tmpl[i].ctl_name == 0) -+ continue; -+ clone[i].owner_env = get_exec_env(); -+ if (tmpl[i].child == NULL) -+ continue; -+ clone[i].child = clone + (tmpl[i].child - tmpl); -+ } -+ return clone; -+} -+ - /** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table -@@ -1382,8 +1480,12 @@ void unregister_sysctl_table(struct ctl_ - spin_lock(&sysctl_lock); - start_unregistering(header); - #ifdef CONFIG_PROC_FS -+#ifdef CONFIG_VE -+ unregister_proc_table(header->ctl_table, get_exec_env()->proc_sys_root); -+#else - unregister_proc_table(header->ctl_table, proc_sys_root); - #endif -+#endif - spin_unlock(&sysctl_lock); - kfree(header); - } -@@ -1469,11 +1571,6 @@ static void unregister_proc_table(ctl_ta - * its fields. We are under sysctl_lock here. - */ - de->data = NULL; -- -- /* Don't unregister proc entries that are still being used.. */ -- if (atomic_read(&de->count)) -- continue; -- - table->de = NULL; - remove_proc_entry(table->procname, root); - } -@@ -1615,7 +1712,7 @@ int proc_dostring(ctl_table *table, int - * to observe. Should this be in kernel/sys.c ???? 
- */ - --static int proc_doutsstring(ctl_table *table, int write, struct file *filp, -+int proc_doutsstring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) - { - int r; -@@ -2190,7 +2287,7 @@ int proc_dostring(ctl_table *table, int - return -ENOSYS; - } - --static int proc_doutsstring(ctl_table *table, int write, struct file *filp, -+int proc_doutsstring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) - { - return -ENOSYS; -@@ -2494,6 +2591,14 @@ void unregister_sysctl_table(struct ctl_ - { - } - -+ctl_table * clone_sysctl_template(ctl_table *tmpl, int nr) -+{ -+ return NULL; -+} -+ -+void free_sysctl_clone(ctl_table *tmpl) -+{ -+} - #endif /* CONFIG_SYSCTL */ - - /* -@@ -2506,6 +2611,7 @@ EXPORT_SYMBOL(proc_dointvec_minmax); - EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); - EXPORT_SYMBOL(proc_dointvec_ms_jiffies); - EXPORT_SYMBOL(proc_dostring); -+EXPORT_SYMBOL(proc_doutsstring); - EXPORT_SYMBOL(proc_doulongvec_minmax); - EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); - EXPORT_SYMBOL(register_sysctl_table); -@@ -2514,3 +2620,5 @@ EXPORT_SYMBOL(sysctl_jiffies); - EXPORT_SYMBOL(sysctl_ms_jiffies); - EXPORT_SYMBOL(sysctl_string); - EXPORT_SYMBOL(unregister_sysctl_table); -+EXPORT_SYMBOL(clone_sysctl_template); -+EXPORT_SYMBOL(free_sysctl_clone); -diff -upr linux-2.6.16.orig/kernel/timer.c linux-2.6.16-026test009/kernel/timer.c ---- linux-2.6.16.orig/kernel/timer.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/timer.c 2006-04-19 15:02:12.000000000 +0400 -@@ -460,7 +460,11 @@ static inline void __run_timers(tvec_bas - spin_unlock_irq(&base->t_base.lock); - { - int preempt_count = preempt_count(); -+ struct ve_struct *ve; -+ -+ ve = set_exec_env(get_ve0()); - fn(data); -+ (void)set_exec_env(ve); - if (preempt_count != preempt_count()) { - printk(KERN_WARNING "huh, entered %p " - "with preempt_count %08x, exited" -@@ -868,6 +872,23 @@ 
EXPORT_SYMBOL(avenrun); - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -+ -+static void calc_load_ve(void) -+{ -+ unsigned long flags, nr_unint; -+ -+ nr_unint = nr_uninterruptible() * FIXED_1; -+ spin_lock_irqsave(&kstat_glb_lock, flags); -+ CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint); -+ CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint); -+ CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint); -+ spin_unlock_irqrestore(&kstat_glb_lock, flags); -+ -+#ifdef CONFIG_VE -+ do_update_load_avg_ve(); -+#endif -+} -+ - static inline void calc_load(unsigned long ticks) - { - unsigned long active_tasks; /* fixed-point */ -@@ -880,6 +901,7 @@ static inline void calc_load(unsigned lo - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); -+ calc_load_ve(); - } - } - -@@ -990,7 +1012,7 @@ asmlinkage unsigned long sys_alarm(unsig - */ - asmlinkage long sys_getpid(void) - { -- return current->tgid; -+ return virt_tgid(current); - } - - /* -@@ -1012,12 +1034,13 @@ asmlinkage long sys_getpid(void) - asmlinkage long sys_getppid(void) - { - int pid; -+#ifndef CONFIG_DEBUG_SLAB - struct task_struct *me = current; - struct task_struct *parent; - - parent = me->group_leader->real_parent; - for (;;) { -- pid = parent->tgid; -+ pid = virt_tgid(parent); - #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) - { - struct task_struct *old = parent; -@@ -1034,6 +1057,16 @@ asmlinkage long sys_getppid(void) - #endif - break; - } -+#else -+ /* -+ * ->real_parent could be released before dereference and -+ * we accessed freed kernel memory, which faults with debugging on. -+ * Keep it simple and stupid. 
-+ */ -+ read_lock(&tasklist_lock); -+ pid = virt_tgid(current->group_leader->real_parent); -+ read_unlock(&tasklist_lock); -+#endif - return pid; - } - -@@ -1164,7 +1197,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterru - /* Thread ID - the internal kernel "pid" */ - asmlinkage long sys_gettid(void) - { -- return current->pid; -+ return virt_pid(current); - } - - /* -@@ -1176,11 +1209,12 @@ asmlinkage long sys_sysinfo(struct sysin - unsigned long mem_total, sav_total; - unsigned int mem_unit, bitcount; - unsigned long seq; -+ unsigned long *__avenrun; -+ struct timespec tp; - - memset((char *)&val, 0, sizeof(struct sysinfo)); - - do { -- struct timespec tp; - seq = read_seqbegin(&xtime_lock); - - /* -@@ -1197,14 +1231,25 @@ asmlinkage long sys_sysinfo(struct sysin - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } -- val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); -- -- val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); -- val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); -- val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); -+ } while (read_seqretry(&xtime_lock, seq)); - -+ if (ve_is_super(get_exec_env())) { -+ val.uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); -+ __avenrun = &avenrun[0]; - val.procs = nr_threads; -- } while (read_seqretry(&xtime_lock, seq)); -+ } -+#ifdef CONFIG_VE -+ else { -+ struct ve_struct *ve; -+ ve = get_exec_env(); -+ __avenrun = &ve->avenrun[0]; -+ val.procs = atomic_read(&ve->pcounter); -+ val.uptime = tp.tv_sec - ve->start_timespec.tv_sec; -+ } -+#endif -+ val.loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); -+ val.loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); -+ val.loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - - si_meminfo(&val); - si_swapinfo(&val); -diff -upr linux-2.6.16.orig/kernel/ub/Kconfig linux-2.6.16-026test009/kernel/ub/Kconfig ---- linux-2.6.16.orig/kernel/ub/Kconfig 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/Kconfig 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,89 @@ -+# -+# User resources part (UBC) -+# -+# Copyright (C) 2005 SWsoft -+# All rights reserved. -+# -+# Licensing governed by "linux/COPYING.SWsoft" file. -+ -+menu "User resources" -+ -+config USER_RESOURCE -+ bool "Enable user resource accounting" -+ default y -+ help -+ This patch provides accounting and allows to configure -+ limits for user's consumption of exhaustible system resources. -+ The most important resource controlled by this patch is unswappable -+ memory (either mlock'ed or used by internal kernel structures and -+ buffers). The main goal of this patch is to protect processes -+ from running short of important resources because of an accidental -+ misbehavior of processes or malicious activity aiming to ``kill'' -+ the system. It's worth to mention that resource limits configured -+ by setrlimit(2) do not give an acceptable level of protection -+ because they cover only small fraction of resources and work on a -+ per-process basis. Per-process accounting doesn't prevent malicious -+ users from spawning a lot of resource-consuming processes. 
-+ -+config USER_RSS_ACCOUNTING -+ bool "Account physical memory usage" -+ default y -+ depends on USER_RESOURCE -+ help -+ This allows to estimate per beancounter physical memory usage. -+ Implemented alghorithm accounts shared pages of memory as well, -+ dividing them by number of beancounter which use the page. -+ -+config USER_SWAP_ACCOUNTING -+ bool "Account swap usage" -+ default y -+ depends on USER_RESOURCE -+ help -+ This allows accounting of swap usage. -+ -+config USER_RESOURCE_PROC -+ bool "Report resource usage in /proc" -+ default y -+ depends on USER_RESOURCE -+ help -+ Allows a system administrator to inspect resource accounts and limits. -+ -+config UBC_DEBUG -+ bool "User resources debug features" -+ default n -+ depends on USER_RESOURCE -+ help -+ Enables to setup debug features for user resource accounting -+ -+config UBC_DEBUG_KMEM -+ bool "Debug kmemsize with cache counters" -+ default n -+ depends on UBC_DEBUG -+ help -+ Adds /proc/user_beancounters_debug entry to get statistics -+ about cache usage of each beancounter -+ -+config UBC_KEEP_UNUSED -+ bool "Keep unused beancounter alive" -+ default y -+ depends on UBC_DEBUG -+ help -+ If on, unused beancounters are kept on the hash and maxheld value -+ can be looked through. -+ -+config UBC_DEBUG_ITEMS -+ bool "Account resources in items rather than in bytes" -+ default y -+ depends on UBC_DEBUG -+ help -+ When true some of the resources (e.g. kmemsize) are accounted -+ in items instead of bytes. -+ -+config UBC_UNLIMITED -+ bool "Use unlimited ubc settings" -+ default y -+ depends on UBC_DEBUG -+ help -+ When ON all limits and barriers are set to max values. 
-+ -+endmenu -diff -upr linux-2.6.16.orig/kernel/ub/Makefile linux-2.6.16-026test009/kernel/ub/Makefile ---- linux-2.6.16.orig/kernel/ub/Makefile 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/Makefile 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,20 @@ -+# -+# User resources part (UBC) -+# -+# Copyright (C) 2005 SWsoft -+# All rights reserved. -+# -+# Licensing governed by "linux/COPYING.SWsoft" file. -+ -+obj-y := ub_sys.o -+obj-$(CONFIG_USER_RESOURCE) += beancounter.o -+obj-$(CONFIG_USER_RESOURCE) += ub_dcache.o -+obj-$(CONFIG_USER_RESOURCE) += ub_mem.o -+obj-$(CONFIG_USER_RESOURCE) += ub_misc.o -+obj-$(CONFIG_USER_RESOURCE) += ub_net.o -+obj-$(CONFIG_USER_RESOURCE) += ub_pages.o -+obj-$(CONFIG_USER_RESOURCE) += ub_stat.o -+# obj-$(CONFIG_USER_RESOURCE) += ub_oom.o -+ -+obj-$(CONFIG_USER_RSS_ACCOUNTING) += ub_page_bc.o -+obj-$(CONFIG_USER_RESOURCE_PROC) += ub_proc.o -diff -upr linux-2.6.16.orig/kernel/ub/beancounter.c linux-2.6.16-026test009/kernel/ub/beancounter.c ---- linux-2.6.16.orig/kernel/ub/beancounter.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/beancounter.c 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,675 @@ -+/* -+ * linux/kernel/ub/beancounter.c -+ * -+ * Copyright (C) 1998 Alan Cox -+ * 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg> -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * TODO: -+ * - more intelligent limit check in mremap(): currently the new size is -+ * charged and _then_ old size is uncharged -+ * (almost done: !move_vma case is completely done, -+ * move_vma in its current implementation requires too many conditions to -+ * do things right, because it may be not only expansion, but shrinking -+ * also, plus do_munmap will require an additional parameter...) 
-+ * - problem: bad pmd page handling -+ * - consider /proc redesign -+ * - TCP/UDP ports -+ * + consider whether __charge_beancounter_locked should be inline -+ * -+ * Changes: -+ * 1999/08/17 Marcelo Tosatti <marcelo@conectiva.com.br> -+ * - Set "barrier" and "limit" parts of limits atomically. -+ * 1999/10/06 Marcelo Tosatti <marcelo@conectiva.com.br> -+ * - setublimit system call. -+ */ -+ -+#include <linux/slab.h> -+#include <linux/module.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_hash.h> -+#include <ub/ub_vmpages.h> -+ -+static kmem_cache_t *ub_cachep; -+static struct user_beancounter default_beancounter; -+struct user_beancounter ub0; -+ -+const char *ub_rnames[] = { -+ "kmemsize", /* 0 */ -+ "lockedpages", -+ "privvmpages", -+ "shmpages", -+ "dummy", -+ "numproc", /* 5 */ -+ "physpages", -+ "vmguarpages", -+ "oomguarpages", -+ "numtcpsock", -+ "numflock", /* 10 */ -+ "numpty", -+ "numsiginfo", -+ "tcpsndbuf", -+ "tcprcvbuf", -+ "othersockbuf", /* 15 */ -+ "dgramrcvbuf", -+ "numothersock", -+ "dcachesize", -+ "numfile", -+ "dummy", /* 20 */ -+ "dummy", -+ "dummy", -+ "numiptent", -+ "unused_privvmpages", /* UB_RESOURCES */ -+ "tmpfs_respages", -+ "swap_pages", -+ "held_pages", -+}; -+ -+static void init_beancounter_struct(struct user_beancounter *ub); -+static void init_beancounter_store(struct user_beancounter *ub); -+static void init_beancounter_nolimits(struct user_beancounter *ub); -+ -+void print_ub_uid(struct user_beancounter *ub, char *buf, int size) -+{ -+ if (ub->parent != NULL) -+ snprintf(buf, size, "%u.%u", ub->parent->ub_uid, ub->ub_uid); -+ else -+ snprintf(buf, size, "%u", ub->ub_uid); -+} -+EXPORT_SYMBOL(print_ub_uid); -+ -+#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1)) -+#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17) -+struct ub_hash_slot ub_hash[UB_HASH_SIZE]; -+spinlock_t ub_hash_lock; -+EXPORT_SYMBOL(ub_hash); -+EXPORT_SYMBOL(ub_hash_lock); -+ -+/* -+ * Per user resource beancounting. 
Resources are tied to their luid. -+ * The resource structure itself is tagged both to the process and -+ * the charging resources (a socket doesn't want to have to search for -+ * things at irq time for example). Reference counters keep things in -+ * hand. -+ * -+ * The case where a user creates resource, kills all his processes and -+ * then starts new ones is correctly handled this way. The refcounters -+ * will mean the old entry is still around with resource tied to it. -+ */ -+struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) -+{ -+ struct user_beancounter *new_ub, *ub; -+ unsigned long flags; -+ struct ub_hash_slot *slot; -+ -+ slot = &ub_hash[ub_hash_fun(uid)]; -+ new_ub = NULL; -+ -+retry: -+ spin_lock_irqsave(&ub_hash_lock, flags); -+ ub = slot->ubh_beans; -+ while (ub != NULL && (ub->ub_uid != uid || ub->parent != NULL)) -+ ub = ub->ub_next; -+ -+ if (ub != NULL) { -+ /* found */ -+ get_beancounter(ub); -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ if (new_ub != NULL) -+ kmem_cache_free(ub_cachep, new_ub); -+ return ub; -+ } -+ -+ if (!create) { -+ /* no ub found */ -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ return NULL; -+ } -+ -+ if (new_ub != NULL) { -+ /* install new ub */ -+ new_ub->ub_next = slot->ubh_beans; -+ slot->ubh_beans = new_ub; -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ return new_ub; -+ } -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ -+ /* alloc new ub */ -+ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, -+ GFP_KERNEL); -+ if (new_ub == NULL) -+ return NULL; -+ -+ ub_debug(UBD_ALLOC, "Creating ub %p in slot %p\n", new_ub, slot); -+ memcpy(new_ub, &default_beancounter, sizeof(*new_ub)); -+ init_beancounter_struct(new_ub); -+ new_ub->ub_uid = uid; -+ goto retry; -+} -+EXPORT_SYMBOL(get_beancounter_byuid); -+ -+struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p, -+ int id, int create) -+{ -+ struct user_beancounter *new_ub, *ub; -+ unsigned long 
flags; -+ struct ub_hash_slot *slot; -+ -+ slot = &ub_hash[ub_subhash_fun(p, id)]; -+ new_ub = NULL; -+ -+retry: -+ spin_lock_irqsave(&ub_hash_lock, flags); -+ ub = slot->ubh_beans; -+ while (ub != NULL && (ub->parent != p || ub->ub_uid != id)) -+ ub = ub->ub_next; -+ -+ if (ub != NULL) { -+ /* found */ -+ get_beancounter(ub); -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ if (new_ub != NULL) { -+ put_beancounter(new_ub->parent); -+ kmem_cache_free(ub_cachep, new_ub); -+ } -+ return ub; -+ } -+ -+ if (!create) { -+ /* no ub found */ -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ return NULL; -+ } -+ -+ if (new_ub != NULL) { -+ /* install new ub */ -+ get_beancounter(new_ub); -+ new_ub->ub_next = slot->ubh_beans; -+ slot->ubh_beans = new_ub; -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ return new_ub; -+ } -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ -+ /* alloc new ub */ -+ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, -+ GFP_KERNEL); -+ if (new_ub == NULL) -+ return NULL; -+ -+ ub_debug(UBD_ALLOC, "Creating sub %p in slot %p\n", new_ub, slot); -+ memset(new_ub, 0, sizeof(*new_ub)); -+ init_beancounter_nolimits(new_ub); -+ init_beancounter_store(new_ub); -+ init_beancounter_struct(new_ub); -+ atomic_set(&new_ub->ub_refcount, 0); -+ new_ub->ub_uid = id; -+ new_ub->parent = get_beancounter(p); -+ goto retry; -+} -+EXPORT_SYMBOL(get_subbeancounter_byid); -+ -+struct user_beancounter *subbeancounter_findcreate(struct user_beancounter *p, -+ int id) -+{ -+ struct user_beancounter *ub; -+ unsigned long flags; -+ struct ub_hash_slot *slot; -+ -+ slot = &ub_hash[ub_subhash_fun(p, id)]; -+ -+ spin_lock_irqsave(&ub_hash_lock, flags); -+ ub = slot->ubh_beans; -+ while (ub != NULL && (ub->parent != p || ub->ub_uid != id)) -+ ub = ub->ub_next; -+ -+ if (ub != NULL) { -+ /* found */ -+ get_beancounter(ub); -+ goto done; -+ } -+ -+ /* alloc new ub */ -+ /* Can be called from non-atomic contexts. 
Den */ -+ ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, GFP_ATOMIC); -+ if (ub == NULL) -+ goto done; -+ -+ ub_debug(UBD_ALLOC, "Creating sub %p in slot %p\n", ub, slot); -+ memset(ub, 0, sizeof(*ub)); -+ init_beancounter_nolimits(ub); -+ init_beancounter_store(ub); -+ init_beancounter_struct(ub); -+ atomic_set(&ub->ub_refcount, 0); -+ ub->ub_uid = id; -+ ub->parent = get_beancounter(p); -+ -+ /* install new ub */ -+ get_beancounter(ub); -+ ub->ub_next = slot->ubh_beans; -+ slot->ubh_beans = ub; -+ -+done: -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ return ub; -+} -+EXPORT_SYMBOL(subbeancounter_findcreate); -+#ifndef CONFIG_UBC_KEEP_UNUSED -+ -+static int verify_res(struct user_beancounter *ub, int resource, -+ unsigned long held) -+{ -+ char id[64]; -+ -+ if (likely(held == 0)) -+ return 1; -+ -+ print_ub_uid(ub, id, sizeof(id)); -+ printk(KERN_WARNING "Ub %s helds %lu in %s on put\n", -+ id, held, ub_rnames[resource]); -+ return 0; -+} -+ -+static inline void verify_held(struct user_beancounter *ub) -+{ -+ int i, clean; -+ -+ clean = 1; -+ for (i = 0; i < UB_RESOURCES; i++) -+ clean &= verify_res(ub, i, ub->ub_parms[i].held); -+ -+ clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages); -+ clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages); -+ clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages); -+ clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages); -+ -+ ub_debug_trace(!clean, 5, 60*HZ); -+} -+ -+static void __unhash_beancounter(struct user_beancounter *ub) -+{ -+ struct user_beancounter **ubptr; -+ struct ub_hash_slot *slot; -+ -+ if (ub->parent != NULL) -+ slot = &ub_hash[ub_subhash_fun(ub->parent, ub->ub_uid)]; -+ else -+ slot = &ub_hash[ub_hash_fun(ub->ub_uid)]; -+ ubptr = &slot->ubh_beans; -+ -+ while (*ubptr != NULL) { -+ if (*ubptr == ub) { -+ verify_held(ub); -+ *ubptr = ub->ub_next; -+ return; -+ } -+ ubptr = &((*ubptr)->ub_next); -+ } -+ printk(KERN_ERR "Invalid beancounter %p, 
luid=%d on free, slot %p\n", -+ ub, ub->ub_uid, slot); -+} -+#endif -+ -+void __put_beancounter(struct user_beancounter *ub) -+{ -+ unsigned long flags; -+ struct user_beancounter *parent; -+ -+again: -+ parent = ub->parent; -+ ub_debug(UBD_ALLOC, "__put bc %p (cnt %d) for %.20s pid %d " -+ "cur %08lx cpu %d.\n", -+ ub, atomic_read(&ub->ub_refcount), -+ current->comm, current->pid, -+ (unsigned long)current, smp_processor_id()); -+ -+ /* equevalent to atomic_dec_and_lock_irqsave() */ -+ local_irq_save(flags); -+ if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) { -+ if (unlikely(atomic_read(&ub->ub_refcount) < 0)) -+ printk(KERN_ERR "UB: Bad ub refcount: ub=%p, " -+ "luid=%d, ref=%d\n", -+ ub, ub->ub_uid, -+ atomic_read(&ub->ub_refcount)); -+ local_irq_restore(flags); -+ return; -+ } -+ -+ if (unlikely(ub == get_ub0())) { -+ printk(KERN_ERR "Trying to put ub0\n"); -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ return; -+ } -+ -+#ifndef CONFIG_UBC_KEEP_UNUSED -+ __unhash_beancounter(ub); -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ ub_free_counters(ub); -+ kmem_cache_free(ub_cachep, ub); -+#else -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+#endif -+ ub = parent; -+ if (ub != NULL) -+ goto again; -+} -+EXPORT_SYMBOL(__put_beancounter); -+ -+/* -+ * Generic resource charging stuff -+ */ -+ -+int __charge_beancounter_locked(struct user_beancounter *ub, -+ int resource, unsigned long val, enum severity strict) -+{ -+ ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n", -+ val, resource, ub, ub->ub_parms[resource].held); -+ /* -+ * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition -+ * at the moment is possible so an overflow is impossible. 
-+ */ -+ ub->ub_parms[resource].held += val; -+ -+ switch (strict) { -+ case UB_HARD: -+ if (ub->ub_parms[resource].held > -+ ub->ub_parms[resource].barrier) -+ break; -+ case UB_SOFT: -+ if (ub->ub_parms[resource].held > -+ ub->ub_parms[resource].limit) -+ break; -+ case UB_FORCE: -+ ub_adjust_maxheld(ub, resource); -+ return 0; -+ default: -+ BUG(); -+ } -+ -+ if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl)) -+ printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n", -+ ub_rnames[resource], ub->ub_uid); -+ ub->ub_parms[resource].failcnt++; -+ ub->ub_parms[resource].held -= val; -+ return -ENOMEM; -+} -+ -+int charge_beancounter(struct user_beancounter *ub, -+ int resource, unsigned long val, enum severity strict) -+{ -+ int retval; -+ struct user_beancounter *p, *q; -+ unsigned long flags; -+ -+ retval = -EINVAL; -+ if (val > UB_MAXVALUE) -+ goto out; -+ -+ local_irq_save(flags); -+ for (p = ub; p != NULL; p = p->parent) { -+ spin_lock(&p->ub_lock); -+ retval = __charge_beancounter_locked(p, resource, val, strict); -+ spin_unlock(&p->ub_lock); -+ if (retval) -+ goto unroll; -+ } -+out_restore: -+ local_irq_restore(flags); -+out: -+ return retval; -+ -+unroll: -+ for (q = ub; q != p; q = q->parent) { -+ spin_lock(&q->ub_lock); -+ __uncharge_beancounter_locked(q, resource, val); -+ spin_unlock(&q->ub_lock); -+ } -+ goto out_restore; -+} -+ -+EXPORT_SYMBOL(charge_beancounter); -+ -+void charge_beancounter_notop(struct user_beancounter *ub, -+ int resource, unsigned long val) -+{ -+ struct user_beancounter *p; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ for (p = ub; p->parent != NULL; p = p->parent) { -+ spin_lock(&p->ub_lock); -+ __charge_beancounter_locked(p, resource, val, UB_FORCE); -+ spin_unlock(&p->ub_lock); -+ } -+ local_irq_restore(flags); -+} -+ -+EXPORT_SYMBOL(charge_beancounter_notop); -+ -+void uncharge_warn(struct user_beancounter *ub, int resource, -+ unsigned long val, unsigned long held) -+{ -+ char id[64]; -+ -+ print_ub_uid(ub, 
id, sizeof(id)); -+ printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n", -+ val, held, ub_rnames[resource], id); -+ ub_debug_trace(1, 10, 10*HZ); -+} -+ -+void __uncharge_beancounter_locked(struct user_beancounter *ub, -+ int resource, unsigned long val) -+{ -+ ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n", -+ val, resource, ub, ub->ub_parms[resource].held); -+ if (ub->ub_parms[resource].held < val) { -+ uncharge_warn(ub, resource, -+ val, ub->ub_parms[resource].held); -+ val = ub->ub_parms[resource].held; -+ } -+ ub->ub_parms[resource].held -= val; -+} -+ -+void uncharge_beancounter(struct user_beancounter *ub, -+ int resource, unsigned long val) -+{ -+ unsigned long flags; -+ struct user_beancounter *p; -+ -+ for (p = ub; p != NULL; p = p->parent) { -+ spin_lock_irqsave(&p->ub_lock, flags); -+ __uncharge_beancounter_locked(p, resource, val); -+ spin_unlock_irqrestore(&p->ub_lock, flags); -+ } -+} -+ -+EXPORT_SYMBOL(uncharge_beancounter); -+ -+void uncharge_beancounter_notop(struct user_beancounter *ub, -+ int resource, unsigned long val) -+{ -+ struct user_beancounter *p; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ for (p = ub; p->parent != NULL; p = p->parent) { -+ spin_lock(&p->ub_lock); -+ __uncharge_beancounter_locked(p, resource, val); -+ spin_unlock(&p->ub_lock); -+ } -+ local_irq_restore(flags); -+} -+ -+EXPORT_SYMBOL(uncharge_beancounter_notop); -+ -+ -+/* -+ * Rate limiting stuff. 
-+ */ -+int ub_ratelimit(struct ub_rate_info *p) -+{ -+ unsigned long cjif, djif; -+ unsigned long flags; -+ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; -+ long new_bucket; -+ -+ spin_lock_irqsave(&ratelimit_lock, flags); -+ cjif = jiffies; -+ djif = cjif - p->last; -+ if (djif < p->interval) { -+ if (p->bucket >= p->burst) { -+ spin_unlock_irqrestore(&ratelimit_lock, flags); -+ return 0; -+ } -+ p->bucket++; -+ } else { -+ new_bucket = p->bucket - (djif / (unsigned)p->interval); -+ if (new_bucket < 0) -+ new_bucket = 0; -+ p->bucket = new_bucket + 1; -+ } -+ p->last = cjif; -+ spin_unlock_irqrestore(&ratelimit_lock, flags); -+ return 1; -+} -+EXPORT_SYMBOL(ub_ratelimit); -+ -+ -+/* -+ * Initialization -+ * -+ * struct user_beancounter contains -+ * - limits and other configuration settings, -+ * with a copy stored for accounting purposes, -+ * - structural fields: lists, spinlocks and so on. -+ * -+ * Before these parts are initialized, the structure should be memset -+ * to 0 or copied from a known clean structure. That takes care of a lot -+ * of fields not initialized explicitly. -+ */ -+ -+static void init_beancounter_struct(struct user_beancounter *ub) -+{ -+ ub->ub_magic = UB_MAGIC; -+ atomic_set(&ub->ub_refcount, 1); -+ spin_lock_init(&ub->ub_lock); -+ INIT_LIST_HEAD(&ub->ub_tcp_sk_list); -+ INIT_LIST_HEAD(&ub->ub_other_sk_list); -+#ifdef CONFIG_UBC_DEBUG_KMEM -+ INIT_LIST_HEAD(&ub->ub_cclist); -+#endif -+} -+ -+static void init_beancounter_store(struct user_beancounter *ub) -+{ -+ int k; -+ -+ for (k = 0; k < UB_RESOURCES; k++) { -+ memcpy(&ub->ub_store[k], &ub->ub_parms[k], -+ sizeof(struct ubparm)); -+ } -+} -+ -+static void init_beancounter_nolimits(struct user_beancounter *ub) -+{ -+ int k; -+ -+ for (k = 0; k < UB_RESOURCES; k++) { -+ ub->ub_parms[k].limit = UB_MAXVALUE; -+ /* FIXME: whether this is right for physpages and guarantees? */ -+ ub->ub_parms[k].barrier = UB_MAXVALUE; -+ } -+ -+ /* FIXME: set unlimited rate? 
*/ -+ ub->ub_limit_rl.burst = 4; -+ ub->ub_limit_rl.interval = 300*HZ; -+} -+ -+static void init_beancounter_syslimits(struct user_beancounter *ub, -+ unsigned long mp) -+{ -+ extern int max_threads; -+ int k; -+ -+ ub->ub_parms[UB_KMEMSIZE].limit = -+ mp > (192*1024*1024 >> PAGE_SHIFT) ? -+ 32*1024*1024 : (mp << PAGE_SHIFT) / 6; -+ ub->ub_parms[UB_LOCKEDPAGES].limit = 8; -+ ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE; -+ ub->ub_parms[UB_SHMPAGES].limit = 64; -+ ub->ub_parms[UB_NUMPROC].limit = max_threads / 2; -+ ub->ub_parms[UB_NUMTCPSOCK].limit = 1024; -+ ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */ -+ ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */ -+ ub->ub_parms[UB_NUMOTHERSOCK].limit = 256; -+ ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */ -+ ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */ -+ ub->ub_parms[UB_NUMFLOCK].limit = 1024; -+ ub->ub_parms[UB_NUMPTY].limit = 16; -+ ub->ub_parms[UB_NUMSIGINFO].limit = 1024; -+ ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; -+ ub->ub_parms[UB_NUMFILE].limit = 1024; -+ -+ for (k = 0; k < UB_RESOURCES; k++) -+ ub->ub_parms[k].barrier = ub->ub_parms[k].limit; -+ -+ ub->ub_limit_rl.burst = 4; -+ ub->ub_limit_rl.interval = 300*HZ; -+} -+ -+void __init ub_init_ub0(void) -+{ -+ struct user_beancounter *ub; -+ -+ init_cache_counters(); -+ ub = get_ub0(); -+ memset(ub, 0, sizeof(*ub)); -+ ub->ub_uid = 0; -+ init_beancounter_nolimits(ub); -+ init_beancounter_store(ub); -+ init_beancounter_struct(ub); -+ -+ memset(¤t->task_bc, 0, sizeof(struct task_beancounter)); -+ (void)set_exec_ub(get_ub0()); -+ current->task_bc.fork_sub = get_beancounter(get_ub0()); -+ init_mm.mm_ub = get_beancounter(ub); -+} -+ -+void __init ub_hash_init(void) -+{ -+ struct ub_hash_slot *slot; -+ -+ spin_lock_init(&ub_hash_lock); -+ /* insert ub0 into the hash */ -+ slot = &ub_hash[ub_hash_fun(get_ub0()->ub_uid)]; -+ slot->ubh_beans = get_ub0(); -+} -+ -+void __init 
ub_init_cache(unsigned long mempages) -+{ -+ extern int skbc_cache_init(void); -+ int res; -+ -+ res = 0; /* skbc_cache_init(); */ -+ ub_cachep = kmem_cache_create("user_beancounters", -+ sizeof(struct user_beancounter), -+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); -+ if (res < 0 || ub_cachep == NULL) -+ panic("Can't create ubc caches\n"); -+ -+ memset(&default_beancounter, 0, sizeof(default_beancounter)); -+#ifdef CONFIG_UBC_UNLIMITED -+ init_beancounter_nolimits(&default_beancounter); -+#else -+ init_beancounter_syslimits(&default_beancounter, mempages); -+#endif -+ init_beancounter_store(&default_beancounter); -+ init_beancounter_struct(&default_beancounter); -+ -+ ub_hash_init(); -+} -diff -upr linux-2.6.16.orig/kernel/ub/ub_dcache.c linux-2.6.16-026test009/kernel/ub/ub_dcache.c ---- linux-2.6.16.orig/kernel/ub/ub_dcache.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/ub_dcache.c 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,325 @@ -+/* -+ * kernel/ub/ub_dcache.c -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/config.h> -+#include <linux/dcache.h> -+#include <linux/slab.h> -+#include <linux/kmem_cache.h> -+#include <linux/fs.h> -+#include <linux/err.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_mem.h> -+#include <ub/ub_dcache.h> -+ -+/* -+ * Locking -+ * traverse dcache_lock d_lock -+ * ub_dentry_charge + + + -+ * ub_dentry_uncharge + - + -+ * ub_dentry_charge_nofail + + - -+ * -+ * d_inuse is atomic so that we can inc dentry's parent d_inuse in -+ * ub_dentry_charhe with the only dentry's d_lock held. -+ * -+ * Race in uncharge vs charge_nofail is handled with dcache_lock. -+ * Race in charge vs charge_nofail is inessential since they both inc d_inuse. -+ * Race in uncharge vs charge is handled by altering d_inuse under d_lock. 
-+ * -+ * Race with d_move is handled this way: -+ * - charge_nofail and uncharge are protected by dcache_lock; -+ * - charge works only with dentry and dentry->d_parent->d_inuse, so -+ * it's enough to lock only the dentry. -+ */ -+ -+/* -+ * Beancounting -+ * UB argument must NOT be NULL -+ */ -+ -+static int do_charge_dcache(struct user_beancounter *ub, unsigned long size, -+ enum severity sv) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv)) -+ goto out_mem; -+ if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv)) -+ goto out_dcache; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ return 0; -+ -+out_dcache: -+ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); -+out_mem: -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ return -ENOMEM; -+} -+ -+static void do_uncharge_dcache(struct user_beancounter *ub, -+ unsigned long size) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); -+ __uncharge_beancounter_locked(ub, UB_DCACHESIZE, size); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+static int charge_dcache(struct user_beancounter *ub, unsigned long size, -+ enum severity sv) -+{ -+ struct user_beancounter *p, *q; -+ -+ for (p = ub; p != NULL; p = p->parent) { -+ if (do_charge_dcache(p, size, sv)) -+ goto unroll; -+ } -+ return 0; -+ -+unroll: -+ for (q = ub; q != p; q = q->parent) -+ do_uncharge_dcache(q, size); -+ return -ENOMEM; -+} -+ -+void uncharge_dcache(struct user_beancounter *ub, unsigned long size) -+{ -+ for (; ub != NULL; ub = ub->parent) -+ do_uncharge_dcache(ub, size); -+} -+ -+static inline void charge_dcache_forced(struct user_beancounter *ub, -+ unsigned long size) -+{ -+ charge_dcache(ub, size, UB_FORCE); -+} -+ -+static inline void d_forced_charge(struct dentry_beancounter *d_bc) -+{ -+ d_bc->d_ub = 
get_beancounter(get_exec_ub()); -+ if (d_bc->d_ub == NULL) -+ return; -+ -+ charge_dcache_forced(d_bc->d_ub, d_bc->d_ubsize); -+} -+ -+static inline void d_uncharge(struct dentry_beancounter *d_bc) -+{ -+ if (d_bc->d_ub == NULL) -+ return; -+ -+ uncharge_dcache(d_bc->d_ub, d_bc->d_ubsize); -+ put_beancounter(d_bc->d_ub); -+ d_bc->d_ub = NULL; -+} -+ -+/* -+ * Alloc / free dentry_beancounter -+ */ -+ -+static inline int d_alloc_beancounter(struct dentry *d) -+{ -+ return 0; -+} -+ -+static inline void d_free_beancounter(struct dentry_beancounter *d_bc) -+{ -+} -+ -+static inline unsigned long d_charge_size(struct dentry *dentry) -+{ -+ /* dentry's d_name is already set to appropriate value (see d_alloc) */ -+ return inode_cachep->objuse + dentry_cache->objuse + -+ (dname_external(dentry) ? -+ kmem_obj_memusage((void *)dentry->d_name.name) : 0); -+} -+ -+/* -+ * dentry mark in use operation -+ * d_lock is held -+ */ -+ -+static int d_inc_inuse(struct dentry *dentry) -+{ -+ struct user_beancounter *ub; -+ struct dentry_beancounter *d_bc; -+ -+ if (dentry != dentry->d_parent) { -+ struct dentry *parent; -+ -+ /* -+ * Increment d_inuse of parent. -+ * It can't change since dentry->d_lock is held. 
-+ */ -+ parent = dentry->d_parent; -+ if (ub_dget_testone(parent)) -+ BUG(); -+ } -+ -+ d_bc = &dentry->dentry_bc; -+ ub = get_beancounter(get_exec_ub()); -+ -+ if (ub != NULL && charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) -+ goto out_err; -+ -+ d_bc->d_ub = ub; -+ return 0; -+ -+out_err: -+ put_beancounter(ub); -+ d_bc->d_ub = NULL; -+ return -ENOMEM; -+} -+ -+/* -+ * no locks -+ */ -+int ub_dentry_alloc(struct dentry *dentry) -+{ -+ int err; -+ struct dentry_beancounter *d_bc; -+ -+ err = d_alloc_beancounter(dentry); -+ if (err < 0) -+ return err; -+ -+ d_bc = &dentry->dentry_bc; -+ d_bc->d_ub = get_beancounter(get_exec_ub()); -+ atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in ub_dcache.h */ -+ d_bc->d_ubsize = d_charge_size(dentry); -+ -+ err = 0; -+ if (d_bc->d_ub != NULL && -+ charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD)) { -+ put_beancounter(d_bc->d_ub); -+ d_free_beancounter(d_bc); -+ err = -ENOMEM; -+ } -+ -+ return err; -+} -+ -+/* -+ * Charge / uncharge functions. -+ * -+ * We take d_lock to protect dentry_bc from concurrent acces -+ * when simultaneous __d_lookup and d_put happens on one dentry. -+ */ -+ -+/* -+ * no dcache_lock, d_lock and rcu_read_lock are held -+ * drops d_lock, rcu_read_lock and returns error if any -+ */ -+int ub_dentry_charge(struct dentry *dentry) -+{ -+ int err; -+ -+ err = 0; -+ if (ub_dget_testone(dentry)) -+ err = d_inc_inuse(dentry); -+ -+ /* -+ * d_lock and rcu_read_lock are dropped here -+ * (see also __d_lookup) -+ */ -+ spin_unlock(&dentry->d_lock); -+ rcu_read_unlock(); -+ -+ if (!err) -+ return 0; -+ -+ /* -+ * d_invlaidate is required for real_lookup -+ * since it tries to create new dentry on -+ * d_lookup failure. 
-+ */ -+ if (!d_invalidate(dentry)) -+ return err; -+ -+ /* didn't succeeded, force dentry to be charged */ -+ d_forced_charge(&dentry->dentry_bc); -+ return 0; -+} -+ -+/* -+ * dcache_lock is held -+ * no d_locks, sequentaly takes and drops from dentry upward -+ */ -+void ub_dentry_uncharge(struct dentry *dentry) -+{ -+ struct dentry *parent; -+ -+ /* go up until status is changed and root is not reached */ -+ while (1) { -+ /* -+ * We need d_lock here to handle -+ * the race with ub_dentry_charge -+ */ -+ spin_lock(&dentry->d_lock); -+ if (!ub_dput_testzero(dentry)) { -+ spin_unlock(&dentry->d_lock); -+ break; -+ } -+ -+ /* state transition 0 => -1 */ -+ d_uncharge(&dentry->dentry_bc); -+ parent = dentry->d_parent; -+ spin_unlock(&dentry->d_lock); -+ -+ /* -+ * dcache_lock is held (see comment in __dget_locked) -+ * so we can safely move upwards. -+ */ -+ if (dentry == parent) -+ break; -+ dentry = parent; -+ } -+} -+ -+/* -+ * forced version. for dget in clean cache, when error is not an option -+ * -+ * dcache_lock is held -+ * no d_locks -+ */ -+void ub_dentry_charge_nofail(struct dentry *dentry) -+{ -+ struct dentry *parent; -+ -+ /* go up until status is changed and root is not reached */ -+ while (1) { -+ if (!ub_dget_testone(dentry)) -+ break; -+ -+ /* -+ * state transition -1 => 0 -+ * -+ * No need to lock dentry before atomic_inc -+ * like we do in ub_dentry_uncharge. -+ * We can't race with ub_dentry_uncharge due -+ * to dcache_lock. The only possible race with -+ * ub_dentry_charge is OK since they both -+ * do atomic_inc. -+ */ -+ d_forced_charge(&dentry->dentry_bc); -+ /* -+ * dcache_lock is held (see comment in __dget_locked) -+ * so we can safely move upwards. 
-+ */ -+ parent = dentry->d_parent; -+ -+ if (dentry == parent) -+ break; -+ dentry = parent; -+ } -+} -diff -upr linux-2.6.16.orig/kernel/ub/ub_mem.c linux-2.6.16-026test009/kernel/ub/ub_mem.c ---- linux-2.6.16.orig/kernel/ub/ub_mem.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/ub_mem.c 2006-04-19 15:02:11.000000000 +0400 -@@ -0,0 +1,384 @@ -+/* -+ * kernel/ub/ub_mem.c -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/slab.h> -+#include <linux/kmem_cache.h> -+#include <linux/kmem_slab.h> -+#include <linux/highmem.h> -+#include <linux/vmalloc.h> -+#include <linux/mm.h> -+#include <linux/gfp.h> -+#include <linux/swap.h> -+#include <linux/spinlock.h> -+#include <linux/sched.h> -+#include <linux/module.h> -+#include <ub/beancounter.h> -+#include <ub/ub_mem.h> -+#include <ub/ub_hash.h> -+ -+/* -+ * Initialization -+ */ -+ -+/* -+ * Slab accounting -+ */ -+ -+#ifdef CONFIG_UBC_DEBUG_KMEM -+ -+#define CC_HASH_SIZE 1024 -+static struct ub_cache_counter *cc_hash[CC_HASH_SIZE]; -+spinlock_t cc_lock; -+ -+static void __free_cache_counters(struct user_beancounter *ub, -+ kmem_cache_t *cachep) -+{ -+ struct ub_cache_counter *cc, **pprev, *del; -+ int i; -+ unsigned long flags; -+ -+ del = NULL; -+ spin_lock_irqsave(&cc_lock, flags); -+ for (i = 0; i < CC_HASH_SIZE; i++) { -+ pprev = &cc_hash[i]; -+ cc = cc_hash[i]; -+ while (cc != NULL) { -+ if (cc->ub != ub && cc->cachep != cachep) { -+ pprev = &cc->next; -+ cc = cc->next; -+ continue; -+ } -+ -+ list_del(&cc->ulist); -+ *pprev = cc->next; -+ cc->next = del; -+ del = cc; -+ cc = *pprev; -+ } -+ } -+ spin_unlock_irqrestore(&cc_lock, flags); -+ -+ while (del != NULL) { -+ cc = del->next; -+ kfree(del); -+ del = cc; -+ } -+} -+ -+void ub_free_counters(struct user_beancounter *ub) -+{ -+ __free_cache_counters(ub, NULL); -+} -+ -+void ub_kmemcache_free(kmem_cache_t *cachep) -+{ -+ 
__free_cache_counters(NULL, cachep); -+} -+ -+void __init init_cache_counters(void) -+{ -+ memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0])); -+ spin_lock_init(&cc_lock); -+} -+ -+#define cc_hash_fun(ub, cachep) ( \ -+ (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \ -+ ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \ -+ ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \ -+ ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \ -+ ) & (CC_HASH_SIZE - 1)) -+ -+static int change_slab_charged(struct user_beancounter *ub, void *objp, -+ unsigned long val, int mask) -+{ -+ struct ub_cache_counter *cc, *new_cnt, **pprev; -+ kmem_cache_t *cachep; -+ unsigned long flags; -+ -+ cachep = virt_to_cache(objp); -+ new_cnt = NULL; -+ -+again: -+ spin_lock_irqsave(&cc_lock, flags); -+ cc = cc_hash[cc_hash_fun(ub, cachep)]; -+ while (cc) { -+ if (cc->ub == ub && cc->cachep == cachep) -+ goto found; -+ cc = cc->next; -+ } -+ -+ if (new_cnt != NULL) -+ goto insert; -+ -+ spin_unlock_irqrestore(&cc_lock, flags); -+ -+ new_cnt = kmalloc(sizeof(*new_cnt), mask & ~__GFP_UBC); -+ if (new_cnt == NULL) -+ return -ENOMEM; -+ -+ new_cnt->counter = 0; -+ new_cnt->ub = ub; -+ new_cnt->cachep = cachep; -+ goto again; -+ -+insert: -+ pprev = &cc_hash[cc_hash_fun(ub, cachep)]; -+ new_cnt->next = *pprev; -+ *pprev = new_cnt; -+ list_add(&new_cnt->ulist, &ub->ub_cclist); -+ cc = new_cnt; -+ new_cnt = NULL; -+ -+found: -+ cc->counter += val; -+ spin_unlock_irqrestore(&cc_lock, flags); -+ if (new_cnt) -+ kfree(new_cnt); -+ return 0; -+} -+ -+static inline int inc_slab_charged(struct user_beancounter *ub, -+ void *objp, int mask) -+{ -+ return change_slab_charged(ub, objp, 1, mask); -+} -+ -+static inline void dec_slab_charged(struct user_beancounter *ub, void *objp) -+{ -+ if (change_slab_charged(ub, objp, -1, 0) < 0) -+ BUG(); -+} -+ -+#include <linux/vmalloc.h> -+ -+static inline int inc_pages_charged(struct user_beancounter *ub, -+ struct page *pg, int order) -+{ -+ int cpu; -+ -+ cpu = get_cpu(); -+ 
ub->ub_stat[cpu].pages_charged += (1 << order); -+ put_cpu(); -+ return 0; -+} -+ -+static inline void dec_pages_charged(struct user_beancounter *ub, -+ struct page *pg, int order) -+{ -+ int cpu; -+ -+ cpu = get_cpu(); -+ ub->ub_stat[cpu].pages_charged -= (1 << order); -+ put_cpu(); -+} -+ -+void inc_vmalloc_charged(struct vm_struct *vm, int flags) -+{ -+ int cpu; -+ struct user_beancounter *ub; -+ -+ if (!(flags & __GFP_UBC)) -+ return; -+ -+ ub = get_exec_ub(); -+ if (ub == NULL) -+ return; -+ -+ cpu = get_cpu(); -+ ub->ub_stat[cpu].vmalloc_charged += vm->nr_pages; -+ put_cpu(); -+} -+ -+void dec_vmalloc_charged(struct vm_struct *vm) -+{ -+ int cpu; -+ struct user_beancounter *ub; -+ -+ ub = page_ub(vm->pages[0]); -+ if (ub == NULL) -+ return; -+ -+ cpu = get_cpu(); -+ ub->ub_stat[cpu].vmalloc_charged -= vm->nr_pages; -+ put_cpu(); -+} -+ -+#else -+#define inc_slab_charged(ub, o, m) (0) -+#define dec_slab_charged(ub, o) do { } while (0) -+#define inc_pages_charged(ub, pg, o) (0) -+#define dec_pages_charged(ub, pg, o) do { } while (0) -+#endif -+ -+static inline struct user_beancounter **slab_ub_ref(void *objp) -+{ -+ kmem_cache_t *cachep; -+ struct slab *slabp; -+ int objnr; -+ -+ cachep = virt_to_cache(objp); -+ BUG_ON(!(cachep->flags & SLAB_UBC)); -+ slabp = virt_to_slab(objp); -+ objnr = (objp - slabp->s_mem) / cachep->buffer_size; -+ return slab_ubcs(cachep, slabp) + objnr; -+} -+ -+struct user_beancounter *slab_ub(void *objp) -+{ -+ struct user_beancounter **ub_ref; -+ -+ ub_ref = slab_ub_ref(objp); -+ return *ub_ref; -+} -+ -+EXPORT_SYMBOL(slab_ub); -+ -+static inline int should_charge(void *objp, int flags) -+{ -+ kmem_cache_t *cachep; -+ -+ cachep = virt_to_cache(objp); -+ if (!(cachep->flags & SLAB_UBC)) -+ return 0; -+ if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC)) -+ return 0; -+ return 1; -+} -+ -+#define should_uncharge(objp) should_charge(objp, __GFP_UBC) -+ -+int ub_slab_charge(void *objp, int flags) -+{ -+ unsigned int size; -+ 
struct user_beancounter *ub; -+ -+ if (!should_charge(objp, flags)) -+ return 0; -+ -+ ub = get_beancounter(get_exec_ub()); -+ if (ub == NULL) -+ return 0; -+ -+ size = CHARGE_SIZE(kmem_obj_memusage(objp)); -+ if (charge_beancounter(ub, UB_KMEMSIZE, size, -+ (flags & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD))) -+ goto out_err; -+ -+ if (inc_slab_charged(ub, objp, flags) < 0) { -+ uncharge_beancounter(ub, UB_KMEMSIZE, size); -+ goto out_err; -+ } -+ *slab_ub_ref(objp) = ub; -+ return 0; -+ -+out_err: -+ put_beancounter(ub); -+ return -ENOMEM; -+} -+ -+void ub_slab_uncharge(void *objp) -+{ -+ unsigned int size; -+ struct user_beancounter **ub_ref; -+ -+ if (!should_uncharge(objp)) -+ return; -+ -+ ub_ref = slab_ub_ref(objp); -+ if (*ub_ref == NULL) -+ return; -+ -+ dec_slab_charged(*ub_ref, objp); -+ size = CHARGE_SIZE(kmem_obj_memusage(objp)); -+ uncharge_beancounter(*ub_ref, UB_KMEMSIZE, size); -+ put_beancounter(*ub_ref); -+ *ub_ref = NULL; -+} -+ -+/* -+ * Pages accounting -+ */ -+ -+inline int ub_page_charge(struct page *page, int order, int mask) -+{ -+ struct user_beancounter *ub; -+ -+ ub = NULL; -+ if (!(mask & __GFP_UBC)) -+ goto out; -+ -+ ub = get_beancounter(get_exec_ub()); -+ if (ub == NULL) -+ goto out; -+ -+ if (charge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order), -+ (mask & __GFP_SOFT_UBC ? 
UB_SOFT : UB_HARD))) -+ goto err; -+ if (inc_pages_charged(ub, page, order) < 0) { -+ uncharge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order)); -+ goto err; -+ } -+out: -+ BUG_ON(page_ub(page) != NULL); -+ page_ub(page) = ub; -+ return 0; -+ -+err: -+ BUG_ON(page_ub(page) != NULL); -+ put_beancounter(ub); -+ return -ENOMEM; -+} -+ -+inline void ub_page_uncharge(struct page *page, int order) -+{ -+ struct user_beancounter *ub; -+ -+ ub = page_ub(page); -+ if (ub == NULL) -+ return; -+ -+ dec_pages_charged(ub, page, order); -+ BUG_ON(ub->ub_magic != UB_MAGIC); -+ uncharge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order)); -+ put_beancounter(ub); -+ page_ub(page) = NULL; -+} -+ -+/* -+ * takes init_mm.page_table_lock -+ * some outer lock to protect pages from vmalloced area must be held -+ */ -+struct user_beancounter *vmalloc_ub(void *obj) -+{ -+ struct page *pg; -+ -+ pg = vmalloc_to_page(obj); -+ if (pg == NULL) -+ return NULL; -+ -+ return page_ub(pg); -+} -+ -+EXPORT_SYMBOL(vmalloc_ub); -+ -+struct user_beancounter *mem_ub(void *obj) -+{ -+ struct user_beancounter *ub; -+ -+ if ((unsigned long)obj >= VMALLOC_START && -+ (unsigned long)obj < VMALLOC_END) -+ ub = vmalloc_ub(obj); -+ else -+ ub = slab_ub(obj); -+ -+ return ub; -+} -+ -+EXPORT_SYMBOL(mem_ub); -diff -upr linux-2.6.16.orig/kernel/ub/ub_misc.c linux-2.6.16-026test009/kernel/ub/ub_misc.c ---- linux-2.6.16.orig/kernel/ub/ub_misc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/ub_misc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,244 @@ -+/* -+ * kernel/ub/ub_misc.c -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/tty.h> -+#include <linux/tty_driver.h> -+#include <linux/signal.h> -+#include <linux/slab.h> -+#include <linux/fs.h> -+#include <linux/sched.h> -+#include <linux/kmem_cache.h> -+#include <linux/module.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_mem.h> -+ -+/* -+ * Task staff -+ */ -+ -+static void init_task_sub(struct task_struct *tsk, -+ struct task_beancounter *old_bc) -+{ -+ struct task_beancounter *new_bc; -+ struct user_beancounter *sub; -+ -+ new_bc = &tsk->task_bc; -+ sub = old_bc->fork_sub; -+ new_bc->fork_sub = get_beancounter(sub); -+ new_bc->task_fnode = NULL; -+ new_bc->task_freserv = old_bc->task_freserv; -+ old_bc->task_freserv = NULL; -+ memset(&new_bc->task_data, 0, sizeof(new_bc->task_data)); -+} -+ -+int ub_task_charge(struct task_struct *parent, struct task_struct *task) -+{ -+ struct task_beancounter *old_bc; -+ struct task_beancounter *new_bc; -+ struct user_beancounter *ub; -+ -+ old_bc = &parent->task_bc; -+#if 0 -+ if (old_bc->exec_ub == NULL) { -+ /* FIXME: this won't work if task_bc is outside task_struct */ -+ init_task_sub(task, old_bc); -+ return 0; -+ } -+#endif -+ ub = old_bc->fork_sub; -+ -+ if (charge_beancounter(ub, UB_NUMPROC, 1, UB_HARD) < 0) -+ return -ENOMEM; -+ -+ new_bc = &task->task_bc; -+ new_bc->task_ub = get_beancounter(ub); -+ new_bc->exec_ub = get_beancounter(ub); -+ init_task_sub(task, old_bc); -+ return 0; -+} -+ -+void ub_task_uncharge(struct task_struct *task) -+{ -+ struct task_beancounter *task_bc; -+ -+ task_bc = &task->task_bc; -+ if (task_bc->task_ub != NULL) -+ uncharge_beancounter(task_bc->task_ub, UB_NUMPROC, 1); -+ -+ put_beancounter(task_bc->exec_ub); -+ put_beancounter(task_bc->task_ub); -+ put_beancounter(task_bc->fork_sub); -+ /* can't be freed elsewhere, failures possible in the middle of fork */ -+ if (task_bc->task_freserv != NULL) -+ kfree(task_bc->task_freserv); -+ -+ task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc; -+} -+ -+/* -+ * Files and file 
locks. -+ */ -+ -+int ub_file_charge(struct file *f) -+{ -+ struct user_beancounter *ub; -+ -+ /* No need to get_beancounter here since it's already got in slab */ -+ ub = slab_ub(f); -+ if (ub == NULL) -+ return 0; -+ -+ return charge_beancounter(ub, UB_NUMFILE, 1, UB_HARD); -+} -+ -+void ub_file_uncharge(struct file *f) -+{ -+ struct user_beancounter *ub; -+ -+ /* Ub will be put in slab */ -+ ub = slab_ub(f); -+ if (ub == NULL) -+ return; -+ -+ uncharge_beancounter(ub, UB_NUMFILE, 1); -+} -+ -+int ub_flock_charge(struct file_lock *fl, int hard) -+{ -+ struct user_beancounter *ub; -+ int err; -+ -+ /* No need to get_beancounter here since it's already got in slab */ -+ ub = slab_ub(fl); -+ if (ub == NULL) -+ return 0; -+ -+ err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT); -+ if (!err) -+ fl->fl_charged = 1; -+ return err; -+} -+ -+void ub_flock_uncharge(struct file_lock *fl) -+{ -+ struct user_beancounter *ub; -+ -+ /* Ub will be put in slab */ -+ ub = slab_ub(fl); -+ if (ub == NULL || !fl->fl_charged) -+ return; -+ -+ uncharge_beancounter(ub, UB_NUMFLOCK, 1); -+ fl->fl_charged = 0; -+} -+ -+/* -+ * Signal handling -+ */ -+ -+static int do_ub_siginfo_charge(struct user_beancounter *ub, -+ unsigned long size) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD)) -+ goto out_kmem; -+ -+ if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD)) -+ goto out_num; -+ -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ return 0; -+ -+out_num: -+ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); -+out_kmem: -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ return -ENOMEM; -+} -+ -+static void do_ub_siginfo_uncharge(struct user_beancounter *ub, -+ unsigned long size) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); -+ __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1); -+ 
spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub) -+{ -+ unsigned long size; -+ struct user_beancounter *p, *q; -+ -+ size = CHARGE_SIZE(kmem_obj_memusage(sq)); -+ for (p = ub; p != NULL; p = p->parent) { -+ if (do_ub_siginfo_charge(p, size)) -+ goto unroll; -+ } -+ -+ sq->sig_ub = get_beancounter(ub); -+ return 0; -+ -+unroll: -+ for (q = ub; q != p; q = q->parent) -+ do_ub_siginfo_uncharge(q, size); -+ return -ENOMEM; -+} -+EXPORT_SYMBOL(ub_siginfo_charge); -+ -+void ub_siginfo_uncharge(struct sigqueue *sq) -+{ -+ unsigned long size; -+ struct user_beancounter *ub, *p; -+ -+ p = ub = sq->sig_ub; -+ sq->sig_ub = NULL; -+ size = CHARGE_SIZE(kmem_obj_memusage(sq)); -+ for (; ub != NULL; ub = ub->parent) -+ do_ub_siginfo_uncharge(ub, size); -+ put_beancounter(p); -+} -+ -+/* -+ * PTYs -+ */ -+ -+int ub_pty_charge(struct tty_struct *tty) -+{ -+ struct user_beancounter *ub; -+ int retval; -+ -+ ub = slab_ub(tty); -+ retval = 0; -+ if (ub && tty->driver->subtype == PTY_TYPE_MASTER && -+ !test_bit(TTY_CHARGED, &tty->flags)) { -+ retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD); -+ if (!retval) -+ set_bit(TTY_CHARGED, &tty->flags); -+ } -+ return retval; -+} -+ -+void ub_pty_uncharge(struct tty_struct *tty) -+{ -+ struct user_beancounter *ub; -+ -+ ub = slab_ub(tty); -+ if (ub && tty->driver->subtype == PTY_TYPE_MASTER && -+ test_bit(TTY_CHARGED, &tty->flags)) { -+ uncharge_beancounter(ub, UB_NUMPTY, 1); -+ clear_bit(TTY_CHARGED, &tty->flags); -+ } -+} -diff -upr linux-2.6.16.orig/kernel/ub/ub_net.c linux-2.6.16-026test009/kernel/ub/ub_net.c ---- linux-2.6.16.orig/kernel/ub/ub_net.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/ub_net.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,1043 @@ -+/* -+ * linux/kernel/ub/ub_net.c -+ * -+ * Copyright (C) 1998-2004 Andrey V. Savochkin <saw@saw.sw.com.sg> -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. 
-+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * TODO: -+ * - sizeof(struct inode) charge -+ * = tcp_mem_schedule() feedback based on ub limits -+ * + measures so that one socket won't exhaust all send buffers, -+ * see bug in bugzilla -+ * = sk->socket check for NULL in snd_wakeups -+ * (tcp_write_space checks for NULL itself) -+ * + in tcp_close(), orphaned socket abortion should be based on ubc -+ * resources (same in tcp_out_of_resources) -+ * Beancounter should also have separate orphaned socket counter... -+ * + for rcv, in-order segment should be accepted -+ * if only barrier is exceeded -+ * = tcp_rmem_schedule() feedback based on ub limits -+ * - repair forward_alloc mechanism for receive buffers -+ * It's idea is that some buffer space is pre-charged so that receive fast -+ * path doesn't need to take spinlocks and do other heavy stuff -+ * + tcp_prune_queue actions based on ub limits -+ * + window adjustments depending on available buffers for receive -+ * - window adjustments depending on available buffers for send -+ * + race around usewreserv -+ * + avoid allocating new page for each tiny-gram, see letter from ANK -+ * + rename ub_sock_lock -+ * + sk->sleep wait queue probably can be used for all wakeups, and -+ * sk->ub_wait is unnecessary -+ * + for UNIX sockets, the current algorithm will lead to -+ * UB_UNIX_MINBUF-sized messages only for non-blocking case -+ * - charge for af_packet sockets -+ * + all datagram sockets should be charged to NUMUNIXSOCK -+ * - we do not charge for skb copies and clones staying in device queues -+ * + live-lock if number of sockets is big and buffer limits are small -+ * [diff-ubc-dbllim3] -+ * - check that multiple readers/writers on the same socket won't cause fatal -+ * consequences -+ * - check allocation/charge orders -+ * + There is potential problem with callback_lock. In *snd_wakeup we take -+ * beancounter first, in sock_def_error_report - callback_lock first. -+ * then beancounter. 
This is not a problem if callback_lock taken -+ * readonly, but anyway... -+ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator -+ * General kernel problems: -+ * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC -+ * notification won't get signals -+ * - datagram_poll looks racy -+ * -+ */ -+ -+#include <linux/net.h> -+#include <linux/slab.h> -+#include <linux/kmem_cache.h> -+#include <linux/gfp.h> -+#include <linux/err.h> -+#include <linux/socket.h> -+#include <linux/module.h> -+#include <linux/sched.h> -+ -+#include <net/sock.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_net.h> -+#include <ub/ub_debug.h> -+ -+ -+/* Skb truesize definition. Bad place. Den */ -+ -+static inline int skb_chargesize_head(struct sk_buff *skb) -+{ -+ return skb_charge_size(skb->end - skb->head + -+ sizeof(struct skb_shared_info)); -+} -+ -+int skb_charge_fullsize(struct sk_buff *skb) -+{ -+ int chargesize; -+ struct sk_buff *skbfrag; -+ -+ chargesize = skb_chargesize_head(skb) + -+ PAGE_SIZE * skb_shinfo(skb)->nr_frags; -+ if (likely(skb_shinfo(skb)->frag_list == NULL)) -+ return chargesize; -+ for (skbfrag = skb_shinfo(skb)->frag_list; -+ skbfrag != NULL; -+ skbfrag = skbfrag->next) { -+ chargesize += skb_charge_fullsize(skbfrag); -+ } -+ return chargesize; -+} -+EXPORT_SYMBOL(skb_charge_fullsize); -+ -+static int ub_sock_makewreserv_locked(struct sock *sk, -+ int bufid, int sockid, unsigned long size); -+ -+int __ub_too_many_orphans(struct sock *sk, int count) -+{ -+ struct user_beancounter *ub; -+ -+ if (sock_has_ubc(sk)) { -+ for (ub = sock_bc(sk)->ub; ub->parent != NULL; ub = ub->parent); -+ if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2) -+ return 1; -+ } -+ return 0; -+} -+ -+/* -+ * Queueing -+ */ -+ -+static void ub_sock_snd_wakeup(struct user_beancounter *ub) -+{ -+ struct list_head *p; -+ struct sock_beancounter *skbc; -+ struct sock *sk; -+ struct user_beancounter *cub; -+ unsigned long added; -+ -+ while 
(!list_empty(&ub->ub_other_sk_list)) { -+ p = ub->ub_other_sk_list.next; -+ skbc = list_entry(p, struct sock_beancounter, ub_sock_list); -+ sk = skbc_sock(skbc); -+ ub_debug(UBD_NET_SLEEP, "Found sock to wake up\n"); -+ added = -skbc->poll_reserv; -+ if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, -+ UB_NUMOTHERSOCK, skbc->ub_waitspc)) -+ break; -+ added += skbc->poll_reserv; -+ -+ /* -+ * See comments in ub_tcp_snd_wakeup. -+ * Locking note: both unix_write_space and -+ * sock_def_write_space take callback_lock themselves. -+ * We take it here just to be on the safe side and to -+ * act the same way as ub_tcp_snd_wakeup does. -+ */ -+ sk->sk_write_space(sk); -+ -+ list_del_init(&skbc->ub_sock_list); -+ -+ if (skbc->ub != ub && added) { -+ cub = get_beancounter(skbc->ub); -+ spin_unlock(&ub->ub_lock); -+ charge_beancounter_notop(cub, UB_OTHERSOCKBUF, added); -+ put_beancounter(cub); -+ spin_lock(&ub->ub_lock); -+ } -+ } -+} -+ -+static void ub_tcp_snd_wakeup(struct user_beancounter *ub) -+{ -+ struct list_head *p; -+ struct sock *sk; -+ struct sock_beancounter *skbc; -+ struct socket *sock; -+ struct user_beancounter *cub; -+ unsigned long added; -+ -+ while (!list_empty(&ub->ub_tcp_sk_list)) { -+ p = ub->ub_tcp_sk_list.next; -+ skbc = list_entry(p, struct sock_beancounter, ub_sock_list); -+ sk = skbc_sock(skbc); -+ -+ added = 0; -+ sock = sk->sk_socket; -+ if (sock == NULL) -+ /* sk being destroyed */ -+ goto cont; -+ -+ ub_debug(UBD_NET_SLEEP, -+ "Checking queue, waiting %lu, reserv %lu\n", -+ skbc->ub_waitspc, skbc->poll_reserv); -+ added = -skbc->poll_reserv; -+ if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, -+ UB_NUMTCPSOCK, skbc->ub_waitspc)) -+ break; -+ added += skbc->poll_reserv; -+ -+ /* -+ * Send async notifications and wake up. -+ * Locking note: we get callback_lock here because -+ * tcp_write_space is over-optimistic about calling context -+ * (socket lock is presumed). So we get the lock here although -+ * it belongs to the callback. 
-+ */ -+ sk->sk_write_space(sk); -+ -+cont: -+ list_del_init(&skbc->ub_sock_list); -+ -+ if (skbc->ub != ub && added) { -+ cub = get_beancounter(skbc->ub); -+ spin_unlock(&ub->ub_lock); -+ charge_beancounter_notop(cub, UB_TCPSNDBUF, added); -+ put_beancounter(cub); -+ spin_lock(&ub->ub_lock); -+ } -+ } -+} -+ -+void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size) -+{ -+ unsigned long flags; -+ struct sock_beancounter *skbc; -+ struct user_beancounter *ub; -+ unsigned long added_reserv; -+ -+ if (!sock_has_ubc(sk)) -+ return; -+ -+ skbc = sock_bc(sk); -+ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size); -+ added_reserv = -skbc->poll_reserv; -+ if (!ub_sock_makewreserv_locked(sk, res, bid2sid(res), size)) { -+ /* -+ * It looks a bit hackish, but it is compatible with both -+ * wait_for_xx_ubspace and poll. -+ * This __set_current_state is equivalent to a wakeup event -+ * right after spin_unlock_irqrestore. 
-+ */ -+ __set_current_state(TASK_RUNNING); -+ added_reserv += skbc->poll_reserv; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ if (added_reserv) -+ charge_beancounter_notop(skbc->ub, res, added_reserv); -+ return; -+ } -+ -+ ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n"); -+ skbc->ub_waitspc = size; -+ if (!list_empty(&skbc->ub_sock_list)) { -+ ub_debug(UBD_NET_SOCKET, -+ "re-adding socket to beancounter %p.\n", ub); -+ goto out; -+ } -+ -+ switch (res) { -+ case UB_TCPSNDBUF: -+ list_add_tail(&skbc->ub_sock_list, -+ &ub->ub_tcp_sk_list); -+ break; -+ case UB_OTHERSOCKBUF: -+ list_add_tail(&skbc->ub_sock_list, -+ &ub->ub_other_sk_list); -+ break; -+ default: -+ BUG(); -+ } -+out: -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+ -+/* -+ * Helpers -+ */ -+ -+void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, -+ unsigned long size, int resource) -+{ -+ if (!sock_has_ubc(sk)) -+ return; -+ -+ if (sock_bc(sk)->ub == NULL) -+ BUG(); -+ skb_bc(skb)->ub = sock_bc(sk)->ub; -+ skb_bc(skb)->charged = size; -+ skb_bc(skb)->resource = resource; -+ -+ /* Ugly. Ugly. 
Skb in sk writequeue can live without ref to sk */ -+ if (skb->sk == NULL) -+ skb->sk = sk; -+} -+ -+static inline void ub_skb_set_uncharge(struct sk_buff *skb) -+{ -+ skb_bc(skb)->ub = NULL; -+ skb_bc(skb)->charged = 0; -+ skb_bc(skb)->resource = 0; -+} -+ -+static inline void __uncharge_sockbuf(struct sock_beancounter *skbc, -+ struct user_beancounter *ub, int resource, unsigned long size) -+{ -+ if (ub != NULL) -+ __uncharge_beancounter_locked(ub, resource, size); -+ -+ if (skbc != NULL) { -+ if (skbc->ub_wcharged > size) -+ skbc->ub_wcharged -= size; -+ else -+ skbc->ub_wcharged = 0; -+ } -+} -+ -+static void ub_update_rmem_thres(struct sock_beancounter *skub) -+{ -+ struct user_beancounter *ub; -+ -+ if (skub && skub->ub) { -+ for (ub = skub->ub; ub->parent != NULL; ub = ub->parent); -+ ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier / -+ (ub->ub_parms[UB_NUMTCPSOCK].held + 1); -+ } -+} -+inline int ub_skb_alloc_bc(struct sk_buff *skb, int gfp_mask) -+{ -+ memset(skb_bc(skb), 0, sizeof(struct skb_beancounter)); -+ return 0; -+} -+ -+inline void ub_skb_free_bc(struct sk_buff *skb) -+{ -+} -+ -+ -+/* -+ * Charge socket number -+ */ -+ -+static inline int sk_alloc_beancounter(struct sock *sk) -+{ -+ struct sock_beancounter *skbc; -+ -+ skbc = sock_bc(sk); -+ memset(skbc, 0, sizeof(struct sock_beancounter)); -+ return 0; -+} -+ -+static inline void sk_free_beancounter(struct sock *sk) -+{ -+} -+ -+static int __sock_charge(struct sock *sk, int res) -+{ -+ struct sock_beancounter *skbc; -+ struct user_beancounter *ub; -+ -+ ub = get_exec_ub(); -+ if (ub == NULL) -+ return 0; -+ if (sk_alloc_beancounter(sk) < 0) -+ return -ENOMEM; -+ -+ skbc = sock_bc(sk); -+ INIT_LIST_HEAD(&skbc->ub_sock_list); -+ -+ if (charge_beancounter(ub, res, 1, UB_HARD) < 0) -+ goto out_limit; -+ -+ /* TCP listen sock or process keeps referrence to UB */ -+ skbc->ub = get_beancounter(ub); -+ return 0; -+ -+out_limit: -+ sk_free_beancounter(sk); -+ return -ENOMEM; -+} -+ -+int 
ub_tcp_sock_charge(struct sock *sk) -+{ -+ int ret; -+ -+ ret = __sock_charge(sk, UB_NUMTCPSOCK); -+ ub_update_rmem_thres(sock_bc(sk)); -+ -+ return ret; -+} -+ -+int ub_other_sock_charge(struct sock *sk) -+{ -+ return __sock_charge(sk, UB_NUMOTHERSOCK); -+} -+ -+EXPORT_SYMBOL(ub_other_sock_charge); -+ -+int ub_sock_charge(struct sock *sk, int family, int type) -+{ -+ return (IS_TCP_SOCK(family, type) ? -+ ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk)); -+} -+ -+/* -+ * Uncharge socket number -+ */ -+ -+void ub_sock_uncharge(struct sock *sk) -+{ -+ int is_tcp_sock; -+ unsigned long flags; -+ struct sock_beancounter *skbc; -+ struct user_beancounter *ub; -+ unsigned long reserv; -+ -+ if (!sock_has_ubc(sk)) -+ return; -+ -+ is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type); -+ skbc = sock_bc(sk); -+ ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk); -+ -+ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ if (!list_empty(&skbc->ub_sock_list)) { -+ ub_debug(UBD_NET_SOCKET, -+ "ub_sock_uncharge: removing from ub(%p) queue.\n", -+ skbc); -+ list_del_init(&skbc->ub_sock_list); -+ } -+ -+ reserv = skbc->poll_reserv; -+ __uncharge_beancounter_locked(ub, -+ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), -+ reserv); -+ __uncharge_beancounter_locked(ub, -+ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); -+ -+ /* The check sk->sk_family != PF_NETLINK is made as the skb is -+ * queued to the kernel end of socket while changed to the user one. -+ * Den */ -+ if (skbc->ub_wcharged > reserv && -+ sk->sk_family != PF_NETLINK) { -+ skbc->ub_wcharged -= reserv; -+ printk(KERN_WARNING -+ "ub_sock_uncharge: wch=%lu for ub %p (%d).\n", -+ skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid); -+ } else -+ skbc->ub_wcharged = 0; -+ skbc->poll_reserv = 0; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+ uncharge_beancounter_notop(skbc->ub, -+ (is_tcp_sock ? 
UB_TCPSNDBUF : UB_OTHERSOCKBUF), -+ reserv); -+ uncharge_beancounter_notop(skbc->ub, -+ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); -+ -+ put_beancounter(skbc->ub); -+ sk_free_beancounter(sk); -+} -+ -+/* -+ * Send - receive buffers -+ */ -+ -+/* Special case for netlink_dump - (un)charges precalculated size */ -+int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk) -+{ -+ int ret; -+ unsigned long chargesize; -+ -+ if (!sock_has_ubc(sk)) -+ return 0; -+ -+ chargesize = skb_charge_fullsize(skb); -+ ret = charge_beancounter(sock_bc(sk)->ub, -+ UB_DGRAMRCVBUF, chargesize, UB_HARD); -+ if (ret < 0) -+ return ret; -+ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); -+ return ret; -+} -+ -+/* -+ * Poll reserv accounting -+ */ -+static int ub_sock_makewreserv_locked(struct sock *sk, -+ int bufid, int sockid, unsigned long size) -+{ -+ unsigned long wcharge_added; -+ struct sock_beancounter *skbc; -+ struct user_beancounter *ub; -+ -+ if (!sock_has_ubc(sk)) -+ goto out; -+ -+ skbc = sock_bc(sk); -+ if (skbc->poll_reserv >= size) /* no work to be done */ -+ goto out; -+ -+ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); -+ ub->ub_parms[bufid].held += size - skbc->poll_reserv; -+ -+ wcharge_added = 0; -+ /* -+ * Logic: -+ * 1) when used memory hits barrier, we set wmem_pressure; -+ * wmem_pressure is reset under barrier/2; -+ * between barrier/2 and barrier we limit per-socket buffer growth; -+ * 2) each socket is guaranteed to get (limit-barrier)/maxsockets -+ * calculated on the base of memory eaten after the barrier is hit -+ */ -+ skbc = sock_bc(sk); -+ if (!ub_hfbarrier_hit(ub, bufid)) { -+ if (ub->ub_wmem_pressure) -+ ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 " -+ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", -+ sk, size, skbc->poll_reserv, -+ ub->ub_parms[bufid].held, -+ skbc->ub_wcharged, sk->sk_sndbuf); -+ ub->ub_wmem_pressure = 0; -+ } -+ if (ub_barrier_hit(ub, bufid)) { -+ if (!ub->ub_wmem_pressure) -+ ub_debug(UBD_NET_SEND, 
"makewres: pressure -> 1 " -+ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", -+ sk, size, skbc->poll_reserv, -+ ub->ub_parms[bufid].held, -+ skbc->ub_wcharged, sk->sk_sndbuf); -+ ub->ub_wmem_pressure = 1; -+ wcharge_added = size - skbc->poll_reserv; -+ skbc->ub_wcharged += wcharge_added; -+ if (skbc->ub_wcharged * ub->ub_parms[sockid].limit + -+ ub->ub_parms[bufid].barrier > -+ ub->ub_parms[bufid].limit) -+ goto unroll; -+ } -+ if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit) -+ goto unroll; -+ -+ ub_adjust_maxheld(ub, bufid); -+ skbc->poll_reserv = size; -+out: -+ return 0; -+ -+unroll: -+ ub_debug(UBD_NET_SEND, -+ "makewres: deny " -+ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", -+ sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held, -+ skbc->ub_wcharged, sk->sk_sndbuf); -+ skbc->ub_wcharged -= wcharge_added; -+ ub->ub_parms[bufid].failcnt++; -+ ub->ub_parms[bufid].held -= size - skbc->poll_reserv; -+ return -ENOMEM; -+} -+ -+int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size) -+{ -+ struct sock_beancounter *skbc; -+ struct user_beancounter *ub; -+ unsigned long flags; -+ unsigned long added_reserv; -+ int err; -+ -+ skbc = sock_bc(sk); -+ -+ /* -+ * This function provides that there is sufficient reserve upon return -+ * only if sk has only one user. We can check poll_reserv without -+ * serialization and avoid locking if the reserve already exists. 
-+ */ -+ if (!sock_has_ubc(sk) || skbc->poll_reserv >= size) -+ return 0; -+ -+ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ added_reserv = -skbc->poll_reserv; -+ err = ub_sock_makewreserv_locked(sk, bufid, bid2sid(bufid), size); -+ added_reserv += skbc->poll_reserv; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+ if (added_reserv) -+ charge_beancounter_notop(skbc->ub, bufid, added_reserv); -+ -+ return err; -+} -+ -+int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size) -+{ -+ struct sock_beancounter *skbc; -+ struct user_beancounter *ub; -+ unsigned long flags; -+ unsigned long added_reserv; -+ int err; -+ -+ if (!sock_has_ubc(sk)) -+ return 0; -+ -+ skbc = sock_bc(sk); -+ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ added_reserv = -skbc->poll_reserv; -+ err = ub_sock_makewreserv_locked(sk, bufid, bid2sid(bufid), size); -+ added_reserv += skbc->poll_reserv; -+ if (!err) -+ skbc->poll_reserv -= size; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+ if (added_reserv) -+ charge_beancounter_notop(skbc->ub, bufid, added_reserv); -+ -+ return err; -+} -+ -+void ub_sock_ret_wreserv(struct sock *sk, int bufid, -+ unsigned long size, unsigned long ressize) -+{ -+ struct sock_beancounter *skbc; -+ struct user_beancounter *ub; -+ unsigned long extra; -+ unsigned long flags; -+ -+ if (!sock_has_ubc(sk)) -+ return; -+ -+ extra = 0; -+ skbc = sock_bc(sk); -+ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ skbc->poll_reserv += size; -+ if (skbc->poll_reserv > ressize) { -+ extra = skbc->poll_reserv - ressize; -+ __uncharge_beancounter_locked(ub, bufid, extra); -+ -+ if (skbc->ub_wcharged > skbc->poll_reserv - ressize) -+ skbc->ub_wcharged -= skbc->poll_reserv - ressize; -+ else -+ skbc->ub_wcharged = 0; -+ skbc->poll_reserv = ressize; -+ } -+ -+ ub_tcp_snd_wakeup(ub); -+ 
spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+ if (extra) -+ uncharge_beancounter_notop(skbc->ub, bufid, extra); -+} -+ -+long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size) -+{ -+ DECLARE_WAITQUEUE(wait, current); -+ -+ add_wait_queue(sk->sk_sleep, &wait); -+ for (;;) { -+ if (signal_pending(current)) -+ break; -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size)) -+ break; -+ -+ if (sk->sk_shutdown & SEND_SHUTDOWN) -+ break; -+ if (sk->sk_err) -+ break; -+ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size); -+ timeo = schedule_timeout(timeo); -+ } -+ __set_current_state(TASK_RUNNING); -+ remove_wait_queue(sk->sk_sleep, &wait); -+ return timeo; -+} -+ -+int ub_sock_makewres_other(struct sock *sk, unsigned long size) -+{ -+ return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size); -+} -+ -+int ub_sock_makewres_tcp(struct sock *sk, unsigned long size) -+{ -+ return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size); -+} -+ -+int ub_sock_getwres_other(struct sock *sk, unsigned long size) -+{ -+ return ub_sock_get_wreserv(sk, UB_OTHERSOCKBUF, size); -+} -+ -+int ub_sock_getwres_tcp(struct sock *sk, unsigned long size) -+{ -+ return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size); -+} -+ -+void ub_sock_retwres_other(struct sock *sk, unsigned long size, -+ unsigned long ressize) -+{ -+ ub_sock_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize); -+} -+ -+void ub_sock_retwres_tcp(struct sock *sk, unsigned long size, -+ unsigned long ressize) -+{ -+ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize); -+} -+ -+void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz) -+{ -+ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz); -+} -+ -+void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz) -+{ -+ ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz); -+} -+ -+void ub_sock_sndqueuedel(struct sock *sk) -+{ -+ struct sock_beancounter *skbc; -+ unsigned long flags; -+ -+ if (!sock_has_ubc(sk)) -+ return; -+ skbc = 
sock_bc(sk); -+ -+ /* race with write_space callback of other socket */ -+ spin_lock_irqsave(&skbc->ub->ub_lock, flags); -+ list_del_init(&skbc->ub_sock_list); -+ spin_unlock_irqrestore(&skbc->ub->ub_lock, flags); -+} -+ -+/* -+ * UB_DGRAMRCVBUF -+ */ -+ -+int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb) -+{ -+ unsigned long chargesize; -+ -+ if (!sock_has_ubc(sk)) -+ return 0; -+ -+ chargesize = skb_charge_fullsize(skb); -+ if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF, -+ chargesize, UB_HARD)) -+ return -ENOMEM; -+ -+ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); -+ return 0; -+} -+ -+EXPORT_SYMBOL(ub_sockrcvbuf_charge); -+ -+static void ub_sockrcvbuf_uncharge(struct sk_buff *skb) -+{ -+ uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF, -+ skb_bc(skb)->charged); -+ ub_skb_set_uncharge(skb); -+} -+ -+/* -+ * UB_TCPRCVBUF -+ */ -+static int charge_tcprcvbuf(struct sock *sk, struct sk_buff *skb, -+ enum severity strict) -+{ -+ int retval; -+ unsigned long flags; -+ struct user_beancounter *ub; -+ unsigned long chargesize; -+ -+ if (!sock_has_ubc(sk)) -+ return 0; -+ -+ /* -+ * Memory pressure reactions: -+ * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND) -+ * 2) set UB_RMEM_SHRINK and tcp_clamp_window() -+ * tcp_collapse_queues() if rmem_alloc > rcvbuf -+ * 3) drop OFO, tcp_purge_ofo() -+ * 4) drop all. -+ * Currently, we do #2 and #3 at once (which means that current -+ * collapsing of OFO queue in tcp_collapse_queues() is a waste of time, -+ * for example...) -+ * On memory pressure we jump from #0 to #3, and when the pressure -+ * subsides, to #1. 
-+ */ -+ retval = 0; -+ chargesize = skb_charge_fullsize(skb); -+ -+ for (ub = sock_bc(sk)->ub; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ ub->ub_parms[UB_TCPRCVBUF].held += chargesize; -+ if (ub->ub_parms[UB_TCPRCVBUF].held > -+ ub->ub_parms[UB_TCPRCVBUF].barrier && -+ strict != UB_FORCE) -+ goto excess; -+ ub_adjust_maxheld(ub, UB_TCPRCVBUF); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+out: -+ if (retval == 0) { -+ charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF, -+ chargesize); -+ ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); -+ } -+ return retval; -+ -+excess: -+ ub->ub_rmem_pressure = UB_RMEM_SHRINK; -+ if (strict == UB_HARD) -+ retval = -ENOMEM; -+ if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit) -+ retval = -ENOMEM; -+ /* -+ * We try to leave numsock*maxadvmss as a reserve for sockets not -+ * queueing any data yet (if the difference between the barrier and the -+ * limit is enough for this reserve). 
-+ */ -+ if (ub->ub_parms[UB_TCPRCVBUF].held + -+ ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss -+ > ub->ub_parms[UB_TCPRCVBUF].limit && -+ atomic_read(&sk->sk_rmem_alloc)) -+ retval = -ENOMEM; -+ if (retval) { -+ ub->ub_parms[UB_TCPRCVBUF].held -= chargesize; -+ ub->ub_parms[UB_TCPRCVBUF].failcnt++; -+ } -+ ub_adjust_maxheld(ub, UB_TCPRCVBUF); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ goto out; -+} -+ -+int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb) -+{ -+ return charge_tcprcvbuf(sk, skb, UB_HARD); -+} -+ -+int ub_tcprcvbuf_charge_forced(struct sock *sk, struct sk_buff *skb) -+{ -+ return charge_tcprcvbuf(sk, skb, UB_FORCE); -+} -+EXPORT_SYMBOL(ub_tcprcvbuf_charge_forced); -+ -+static void ub_tcprcvbuf_uncharge(struct sk_buff *skb) -+{ -+ unsigned long flags; -+ unsigned long held, bar; -+ int prev_pres; -+ struct user_beancounter *ub; -+ -+ for (ub = skb_bc(skb)->ub; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) { -+ printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n", -+ skb_bc(skb)->charged, -+ ub, ub->ub_parms[UB_TCPRCVBUF].held); -+ /* ass-saving bung */ -+ skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held; -+ } -+ ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged; -+ held = ub->ub_parms[UB_TCPRCVBUF].held; -+ bar = ub->ub_parms[UB_TCPRCVBUF].barrier; -+ prev_pres = ub->ub_rmem_pressure; -+ if (held <= bar - (bar >> 2)) -+ ub->ub_rmem_pressure = UB_RMEM_EXPAND; -+ else if (held <= bar) -+ ub->ub_rmem_pressure = UB_RMEM_KEEP; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+ uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF, -+ skb_bc(skb)->charged); -+ ub_skb_set_uncharge(skb); -+} -+ -+ -+/* -+ * UB_OTHERSOCKBUF -+ */ -+ -+static void ub_socksndbuf_uncharge(struct sk_buff *skb) -+{ -+ unsigned long flags; -+ struct user_beancounter *ub, *cub; -+ struct sock_beancounter *sk_bc; -+ -+ /* resource was 
set. no check for ub required */ -+ cub = skb_bc(skb)->ub; -+ for (ub = cub; ub->parent != NULL; ub = ub->parent); -+ skb_bc(skb)->ub = NULL; -+ if (skb->sk != NULL) -+ sk_bc = sock_bc(skb->sk); -+ else -+ sk_bc = NULL; -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ __uncharge_sockbuf(sk_bc, ub, UB_OTHERSOCKBUF, -+ skb_bc(skb)->charged); -+ ub_sock_snd_wakeup(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+ uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, skb_bc(skb)->charged); -+ ub_skb_set_uncharge(skb); -+} -+ -+static void ub_tcpsndbuf_uncharge(struct sk_buff *skb) -+{ -+ unsigned long flags; -+ struct user_beancounter *ub, *cub; -+ -+ /* resource can be not set, called manually */ -+ cub = skb_bc(skb)->ub; -+ if (cub == NULL) -+ return; -+ for (ub = cub; ub->parent != NULL; ub = ub->parent); -+ skb_bc(skb)->ub = NULL; -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ __uncharge_sockbuf(sock_bc(skb->sk), ub, UB_TCPSNDBUF, -+ skb_bc(skb)->charged); -+ ub_tcp_snd_wakeup(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+ uncharge_beancounter_notop(cub, UB_TCPSNDBUF, skb_bc(skb)->charged); -+ ub_skb_set_uncharge(skb); -+} -+ -+void ub_skb_uncharge(struct sk_buff *skb) -+{ -+ switch (skb_bc(skb)->resource) { -+ case UB_TCPSNDBUF: -+ ub_tcpsndbuf_uncharge(skb); -+ break; -+ case UB_TCPRCVBUF: -+ ub_tcprcvbuf_uncharge(skb); -+ break; -+ case UB_DGRAMRCVBUF: -+ ub_sockrcvbuf_uncharge(skb); -+ break; -+ case UB_OTHERSOCKBUF: -+ ub_socksndbuf_uncharge(skb); -+ break; -+ } -+} -+ -+EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */ -+ -+/* -+ * TCP send buffers accouting. 
Paged part -+ */ -+int ub_sock_tcp_chargepage(struct sock *sk) -+{ -+ struct sock_beancounter *skbc; -+ struct user_beancounter *ub; -+ unsigned long added; -+ unsigned long flags; -+ int err; -+ -+ if (!sock_has_ubc(sk)) -+ return 0; -+ -+ skbc = sock_bc(sk); -+ -+ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ /* Try to charge full page */ -+ err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, UB_NUMTCPSOCK, -+ PAGE_SIZE); -+ if (err == 0) { -+ skbc->poll_reserv -= PAGE_SIZE; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, PAGE_SIZE); -+ return 0; -+ } -+ -+ /* Try to charge page enough to satisfy sys_select. The possible -+ overdraft for the rest of the page is generally better then -+ requesting full page in tcp_poll. This should not happen -+ frequently. Den */ -+ added = -skbc->poll_reserv; -+ err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, UB_NUMTCPSOCK, -+ SOCK_MIN_UBCSPACE); -+ if (err < 0) { -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ return err; -+ } -+ __charge_beancounter_locked(ub, UB_TCPSNDBUF, -+ PAGE_SIZE - skbc->poll_reserv, -+ UB_FORCE); -+ added += PAGE_SIZE; -+ skbc->poll_reserv = 0; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ -+ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added); -+ -+ return 0; -+ -+} -+ -+void ub_sock_tcp_detachpage(struct sock *sk) -+{ -+ struct sk_buff *skb; -+ -+ if (!sock_has_ubc(sk)) -+ return; -+ -+ /* The page is just detached from socket. 
The last skb in queue -+ with paged part holds referrence to it */ -+ skb = skb_peek_tail(&sk->sk_write_queue); -+ if (skb == NULL) { -+ /* If the queue is empty - all data is sent and page is about -+ to be freed */ -+ uncharge_beancounter(sock_bc(sk)->ub, UB_TCPSNDBUF, PAGE_SIZE); -+ return; -+ } -+ /* Last skb is a good aproximation for a last skb with paged part */ -+ skb_bc(skb)->charged += PAGE_SIZE; -+} -+ -+static int charge_tcpsndbuf(struct sock *sk, struct sk_buff *skb, -+ enum severity strict) -+{ -+ int ret; -+ unsigned long chargesize; -+ -+ if (!sock_has_ubc(sk)) -+ return 0; -+ -+ chargesize = skb_charge_fullsize(skb); -+ ret = charge_beancounter(sock_bc(sk)->ub, UB_TCPSNDBUF, chargesize, -+ strict); -+ if (ret < 0) -+ return ret; -+ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); -+ sock_bc(sk)->ub_wcharged += chargesize; -+ return ret; -+} -+ -+int ub_tcpsndbuf_charge(struct sock *sk, struct sk_buff *skb) -+{ -+ return charge_tcpsndbuf(sk, skb, UB_HARD); -+} -+ -+int ub_tcpsndbuf_charge_forced(struct sock *sk, struct sk_buff *skb) -+{ -+ return charge_tcpsndbuf(sk, skb, UB_FORCE); -+} -+EXPORT_SYMBOL(ub_tcpsndbuf_charge_forced); -+ -+/* -+ * Initialization staff -+ */ -+int __init skbc_cache_init(void) -+{ -+ return 0; -+} -diff -upr linux-2.6.16.orig/kernel/ub/ub_page_bc.c linux-2.6.16-026test009/kernel/ub/ub_page_bc.c ---- linux-2.6.16.orig/kernel/ub/ub_page_bc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/ub_page_bc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,440 @@ -+/* -+ * kernel/ub/ub_page_bc.c -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. 
-+ * -+ */ -+ -+#include <linux/spinlock.h> -+#include <linux/slab.h> -+#include <linux/mm.h> -+#include <linux/gfp.h> -+#include <linux/vmalloc.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_hash.h> -+#include <ub/ub_vmpages.h> -+#include <ub/ub_page.h> -+ -+static kmem_cache_t *pb_cachep; -+static spinlock_t pb_lock = SPIN_LOCK_UNLOCKED; -+static struct page_beancounter **pb_hash_table; -+static unsigned int pb_hash_mask; -+ -+/* -+ * Auxiliary staff -+ */ -+ -+static inline struct page_beancounter *next_page_pb(struct page_beancounter *p) -+{ -+ return list_entry(p->page_list.next, struct page_beancounter, -+ page_list); -+} -+ -+static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p) -+{ -+ return list_entry(p->page_list.prev, struct page_beancounter, -+ page_list); -+} -+ -+/* -+ * Held pages manipulation -+ */ -+static inline void set_held_pages(struct user_beancounter *bc) -+{ -+ /* all three depend on ub_held_pages */ -+ __ub_update_physpages(bc); -+ __ub_update_oomguarpages(bc); -+ __ub_update_privvm(bc); -+} -+ -+static inline void do_dec_held_pages(struct user_beancounter *ub, int value) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ ub->ub_held_pages -= value; -+ set_held_pages(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+static void dec_held_pages(struct user_beancounter *ub, int value) -+{ -+ for (; ub != NULL; ub = ub->parent) -+ do_dec_held_pages(ub, value); -+} -+ -+static inline void do_inc_held_pages(struct user_beancounter *ub, int value) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ ub->ub_held_pages += value; -+ set_held_pages(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+static void inc_held_pages(struct user_beancounter *ub, int value) -+{ -+ for (; ub != NULL; ub = ub->parent) -+ do_inc_held_pages(ub, value); -+} -+ -+/* -+ * Alloc - free -+ */ -+ -+inline int pb_alloc(struct page_beancounter **pbc) -+{ -+ *pbc = 
kmem_cache_alloc(pb_cachep, GFP_KERNEL); -+ if (*pbc != NULL) { -+ (*pbc)->next_hash = NULL; -+ (*pbc)->pb_magic = PB_MAGIC; -+ } -+ return (*pbc == NULL); -+} -+ -+inline void pb_free(struct page_beancounter **pb) -+{ -+ if (*pb != NULL) { -+ kmem_cache_free(pb_cachep, *pb); -+ *pb = NULL; -+ } -+} -+ -+void pb_free_list(struct page_beancounter **p_pb) -+{ -+ struct page_beancounter *list, *pb; -+ -+ list = *p_pb; -+ if (list == PBC_COPY_SAME) -+ return; -+ -+ while (list) { -+ pb = list; -+ list = list->next_hash; -+ pb_free(&pb); -+ } -+ *p_pb = NULL; -+} -+ -+/* -+ * head -> <new objs> -> <old objs> -> ... -+ */ -+static int __alloc_list(struct page_beancounter **head, int num) -+{ -+ struct page_beancounter *pb; -+ -+ while (num > 0) { -+ if (pb_alloc(&pb)) -+ return -1; -+ pb->next_hash = *head; -+ *head = pb; -+ num--; -+ } -+ -+ return num; -+} -+ -+/* -+ * Ensure that the list contains at least num elements. -+ * p_pb points to an initialized list, may be of the zero length. -+ * -+ * mm->page_table_lock should be held -+ */ -+int pb_alloc_list(struct page_beancounter **p_pb, int num) -+{ -+ struct page_beancounter *list; -+ -+ for (list = *p_pb; list != NULL && num; list = list->next_hash, num--); -+ if (!num) -+ return 0; -+ -+ /* -+ * *p_pb(after) *p_pb (before) -+ * \ \ -+ * <new objs> -...-> <old objs> -> ... 
-+ */ -+ if (__alloc_list(p_pb, num) < 0) -+ goto nomem; -+ return 0; -+ -+nomem: -+ pb_free_list(p_pb); -+ return -ENOMEM; -+} -+ -+/* -+ * Allocates a page_beancounter for each -+ * user_beancounter in a hash -+ */ -+int pb_alloc_all(struct page_beancounter **pbs) -+{ -+ int i, need_alloc; -+ unsigned long flags; -+ struct user_beancounter *ub; -+ -+ spin_lock_irqsave(&ub_hash_lock, flags); -+ need_alloc = 0; -+ for_each_beancounter(i, ub) -+ need_alloc++; -+ spin_unlock_irqrestore(&ub_hash_lock, flags); -+ -+ if (!__alloc_list(pbs, need_alloc)) -+ return 0; -+ -+ pb_free_list(pbs); -+ return -ENOMEM; -+} -+ -+/* -+ * Hash routines -+ */ -+ -+static inline int pb_hash(struct user_beancounter *ub, struct page *page) -+{ -+ return (((unsigned long)ub << 16) + ((unsigned long)ub >> 16) + -+ (page_to_pfn(page) >> 7)) & pb_hash_mask; -+} -+ -+/* pb_lock should be held */ -+static inline void insert_pb(struct page_beancounter *p, struct page *page, -+ struct user_beancounter *ub, int hash) -+{ -+ p->page = page; -+ p->ub = get_beancounter(ub); -+ p->next_hash = pb_hash_table[hash]; -+ pb_hash_table[hash] = p; -+#ifdef CONFIG_UBC_DEBUG_KMEM -+ ub->ub_stat[smp_processor_id()].pbcs++; -+#endif -+} -+ -+/* -+ * Heart -+ */ -+ -+static int __pb_dup_ref(struct page *page, struct user_beancounter *bc, -+ int hash) -+{ -+ struct page_beancounter *p; -+ -+ for (p = pb_hash_table[hash]; -+ p != NULL && (p->page != page || p->ub != bc); -+ p = p->next_hash); -+ if (p == NULL) -+ return -1; -+ -+ PB_COUNT_INC(p->refcount); -+ return 0; -+} -+ -+static void __pb_add_ref(struct page *page, struct user_beancounter *bc, -+ struct page_beancounter **ppb, int hash) -+{ -+ struct page_beancounter *head, *p; -+ int shift; -+ -+ p = *ppb; -+ *ppb = p->next_hash; -+ -+ insert_pb(p, page, bc, hash); -+ head = page_pbc(page); -+ -+ if (head != NULL) { -+ /* -+ * Move the first element to the end of the list. -+ * List head (pb_head) is set to the next entry. 
-+ * Note that this code works even if head is the only element -+ * on the list (because it's cyclic). -+ */ -+ BUG_ON(head->pb_magic != PB_MAGIC); -+ page_pbc(page) = next_page_pb(head); -+ PB_SHIFT_INC(head->refcount); -+ shift = PB_SHIFT_GET(head->refcount); -+ /* -+ * Update user beancounter, the share of head has been changed. -+ * Note that the shift counter is taken after increment. -+ */ -+ dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift); -+ /* add the new page beancounter to the end of the list */ -+ list_add_tail(&p->page_list, &page_pbc(page)->page_list); -+ } else { -+ page_pbc(page) = p; -+ shift = 0; -+ INIT_LIST_HEAD(&p->page_list); -+ } -+ -+ p->refcount = PB_REFCOUNT_MAKE(shift, 1); -+ /* update user beancounter for the new page beancounter */ -+ inc_held_pages(bc, UB_PAGE_WEIGHT >> shift); -+} -+ -+void pb_add_ref(struct page *page, struct mm_struct *mm, -+ struct page_beancounter **p_pb) -+{ -+ int hash; -+ struct user_beancounter *bc; -+ -+ bc = mm->mm_ub; -+ if (bc == NULL) -+ return; -+ -+ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) -+ return; -+ -+ hash = pb_hash(bc, page); -+ -+ spin_lock(&pb_lock); -+ if (__pb_dup_ref(page, bc, hash)) -+ __pb_add_ref(page, bc, p_pb, hash); -+ spin_unlock(&pb_lock); -+} -+ -+void pb_dup_ref(struct page *page, struct mm_struct *mm, -+ struct page_beancounter **p_pb) -+{ -+ int hash; -+ struct user_beancounter *bc; -+ -+ bc = mm->mm_ub; -+ if (bc == NULL) -+ return; -+ -+ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) -+ return; -+ -+ hash = pb_hash(bc, page); -+ -+ spin_lock(&pb_lock); -+ if (page_pbc(page) == NULL) -+ /* -+ * pages like ZERO_PAGE must not be accounted in pbc -+ * so on fork we just skip them -+ */ -+ goto out_unlock; -+ -+ if (unlikely(*p_pb != PBC_COPY_SAME)) -+ __pb_add_ref(page, bc, p_pb, hash); -+ else if (unlikely(__pb_dup_ref(page, bc, hash))) -+ WARN_ON(1); -+out_unlock: -+ spin_unlock(&pb_lock); -+} -+ -+void pb_remove_ref(struct page *page, struct mm_struct 
*mm) -+{ -+ int hash; -+ struct user_beancounter *bc; -+ struct page_beancounter *p, **q; -+ int shift, shiftt; -+ -+ bc = mm->mm_ub; -+ if (bc == NULL) -+ return; -+ -+ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) -+ return; -+ -+ hash = pb_hash(bc, page); -+ -+ spin_lock(&pb_lock); -+ BUG_ON(page_pbc(page) != NULL && page_pbc(page)->pb_magic != PB_MAGIC); -+ for (q = pb_hash_table + hash, p = *q; -+ p != NULL && (p->page != page || p->ub != bc); -+ q = &p->next_hash, p = *q); -+ if (p == NULL) -+ goto out_unlock; -+ -+ PB_COUNT_DEC(p->refcount); -+ if (PB_COUNT_GET(p->refcount)) -+ /* -+ * More references from the same user beancounter exist. -+ * Nothing needs to be done. -+ */ -+ goto out_unlock; -+ -+ /* remove from the hash list */ -+ *q = p->next_hash; -+ -+ shift = PB_SHIFT_GET(p->refcount); -+ -+ dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift); -+ -+ if (page_pbc(page) == p) { -+ if (list_empty(&p->page_list)) -+ goto out_free; -+ page_pbc(page) = next_page_pb(p); -+ } -+ list_del(&p->page_list); -+ put_beancounter(p->ub); -+#ifdef CONFIG_UBC_DEBUG_KMEM -+ p->ub->ub_stat[smp_processor_id()].pbcs--; -+#endif -+ pb_free(&p); -+ -+ /* Now balance the list. Move the tail and adjust its shift counter. 
*/ -+ p = prev_page_pb(page_pbc(page)); -+ shiftt = PB_SHIFT_GET(p->refcount); -+ page_pbc(page) = p; -+ PB_SHIFT_DEC(p->refcount); -+ -+ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); -+ -+ /* -+ * If the shift counter of the moved beancounter is different from the -+ * removed one's, repeat the procedure for one more tail beancounter -+ */ -+ if (shiftt > shift) { -+ p = prev_page_pb(page_pbc(page)); -+ page_pbc(page) = p; -+ PB_SHIFT_DEC(p->refcount); -+ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); -+ } -+ spin_unlock(&pb_lock); -+ return; -+ -+out_free: -+ page_pbc(page) = NULL; -+#ifdef CONFIG_UBC_DEBUG_KMEM -+ p->ub->ub_stat[smp_processor_id()].pbcs--; -+#endif -+ put_beancounter(p->ub); -+ pb_free(&p); -+out_unlock: -+ spin_unlock(&pb_lock); -+ return; -+} -+ -+struct user_beancounter *pb_grab_page_ub(struct page *page) -+{ -+ struct page_beancounter *pb; -+ struct user_beancounter *ub; -+ -+ spin_lock(&pb_lock); -+ pb = page_pbc(page); -+ ub = (pb == NULL ? ERR_PTR(-EINVAL) : -+ get_beancounter(pb->ub)); -+ spin_unlock(&pb_lock); -+ return ub; -+} -+ -+void __init ub_init_pbc(void) -+{ -+ unsigned long hash_size; -+ -+ pb_cachep = kmem_cache_create("page_beancounter", -+ sizeof(struct page_beancounter), 0, -+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); -+ hash_size = num_physpages >> 2; -+ for (pb_hash_mask = 1; -+ (hash_size & pb_hash_mask) != hash_size; -+ pb_hash_mask = (pb_hash_mask << 1) + 1); -+ hash_size = pb_hash_mask + 1; -+ printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size); -+ pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *)); -+ memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *)); -+} -diff -upr linux-2.6.16.orig/kernel/ub/ub_pages.c linux-2.6.16-026test009/kernel/ub/ub_pages.c ---- linux-2.6.16.orig/kernel/ub/ub_pages.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/ub_pages.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,530 @@ -+/* -+ * 
kernel/ub/ub_pages.c -+ * -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/mm.h> -+#include <linux/highmem.h> -+#include <linux/virtinfo.h> -+#include <linux/module.h> -+#include <linux/shmem_fs.h> -+#include <linux/vmalloc.h> -+ -+#include <asm/pgtable.h> -+#include <asm/page.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_vmpages.h> -+ -+void warn_bad_rss(struct vm_area_struct *vma, unsigned long freed) -+{ -+ static struct ub_rate_info ri = { -+ .burst = 10, -+ .interval = 40 * HZ, -+ }; -+ struct user_beancounter *ub; -+ char ubuid[64] = "No UB"; -+ unsigned long vmrss; -+ -+ if (!ub_ratelimit(&ri)) -+ return; -+ -+ ub = vma->vm_mm->mm_ub; -+ if (ub) -+ print_ub_uid(ub, ubuid, sizeof(ubuid)); -+ -+ vmrss = get_vma_rss(vma) + freed; -+ printk(KERN_WARNING -+ "%s vm_rss: process pid %d comm %.20s flags %lx\n" -+ "vma %p/%p rss %lu/%lu freed %lu\n" -+ "flags %lx, ub %s\n", -+ vmrss > freed ? 
"Positive" : "Negative", -+ current->pid, current->comm, current->flags, -+ vma, vma->vm_mm, vmrss, vma_pages(vma), freed, -+ vma->vm_flags, ubuid); -+ dump_stack(); -+} -+ -+static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma, -+ pmd_t *pmd, unsigned long addr, unsigned long end, -+ unsigned long *ret) -+{ -+ pte_t *pte; -+ spinlock_t *ptl; -+ -+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); -+ do { -+ if (!pte_none(*pte) && pte_present(*pte)) -+ (*ret)++; -+ } while (pte++, addr += PAGE_SIZE, (addr != end)); -+ pte_unmap_unlock(pte - 1, ptl); -+ -+ return addr; -+} -+ -+static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma, -+ pud_t *pud, unsigned long addr, unsigned long end, -+ unsigned long *ret) -+{ -+ pmd_t *pmd; -+ unsigned long next; -+ -+ pmd = pmd_offset(pud, addr); -+ do { -+ next = pmd_addr_end(addr, end); -+ if (pmd_none_or_clear_bad(pmd)) -+ continue; -+ next = pages_in_pte_range(vma, pmd, addr, next, ret); -+ } while (pmd++, addr = next, (addr != end)); -+ -+ return addr; -+} -+ -+static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma, -+ pgd_t *pgd, unsigned long addr, unsigned long end, -+ unsigned long *ret) -+{ -+ pud_t *pud; -+ unsigned long next; -+ -+ pud = pud_offset(pgd, addr); -+ do { -+ next = pud_addr_end(addr, end); -+ if (pud_none_or_clear_bad(pud)) -+ continue; -+ next = pages_in_pmd_range(vma, pud, addr, next, ret); -+ } while (pud++, addr = next, (addr != end)); -+ -+ return addr; -+} -+ -+unsigned long pages_in_vma_range(struct vm_area_struct *vma, -+ unsigned long addr, unsigned long end) -+{ -+ pgd_t *pgd; -+ unsigned long next; -+ unsigned long ret; -+ -+ ret = 0; -+ BUG_ON(addr >= end); -+ pgd = pgd_offset(vma->vm_mm, addr); -+ do { -+ next = pgd_addr_end(addr, end); -+ if (pgd_none_or_clear_bad(pgd)) -+ continue; -+ next = pages_in_pud_range(vma, pgd, addr, next, &ret); -+ } while (pgd++, addr = next, (addr != end)); -+ return ret; -+} -+ -+void fastcall 
__ub_update_physpages(struct user_beancounter *ub) -+{ -+ ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages -+ + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT); -+ ub_adjust_maxheld(ub, UB_PHYSPAGES); -+} -+ -+void fastcall __ub_update_oomguarpages(struct user_beancounter *ub) -+{ -+ ub->ub_parms[UB_OOMGUARPAGES].held = -+ ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages; -+ ub_adjust_maxheld(ub, UB_OOMGUARPAGES); -+} -+ -+void fastcall __ub_update_privvm(struct user_beancounter *ub) -+{ -+ ub->ub_parms[UB_PRIVVMPAGES].held = -+ (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT) -+ + ub->ub_unused_privvmpages -+ + ub->ub_parms[UB_SHMPAGES].held; -+ ub_adjust_maxheld(ub, UB_PRIVVMPAGES); -+} -+ -+static inline int __charge_privvm_locked(struct user_beancounter *ub, -+ unsigned long s, enum severity strict) -+{ -+ if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0) -+ return -ENOMEM; -+ -+ ub->ub_unused_privvmpages += s; -+ return 0; -+} -+ -+static void __unused_privvm_dec_locked(struct user_beancounter *ub, -+ long size) -+{ -+ /* catch possible overflow */ -+ if (ub->ub_unused_privvmpages < size) { -+ uncharge_warn(ub, UB_UNUSEDPRIVVM, -+ size, ub->ub_unused_privvmpages); -+ size = ub->ub_unused_privvmpages; -+ } -+ ub->ub_unused_privvmpages -= size; -+ __ub_update_privvm(ub); -+} -+ -+void __ub_unused_privvm_dec(struct mm_struct *mm, long size) -+{ -+ unsigned long flags; -+ struct user_beancounter *ub; -+ -+ ub = mm->mm_ub; -+ if (ub == NULL) -+ return; -+ -+ for (; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ __unused_privvm_dec_locked(ub, size); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+void ub_unused_privvm_sub(struct mm_struct *mm, -+ struct vm_area_struct *vma, unsigned long count) -+{ -+ if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) -+ __ub_unused_privvm_dec(mm, count); -+} -+ -+void ub_unused_privvm_add(struct mm_struct *mm, -+ struct vm_area_struct *vma, unsigned long size) -+{ -+ 
unsigned long flags; -+ struct user_beancounter *ub; -+ -+ ub = mm->mm_ub; -+ if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) -+ return; -+ -+ for (; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ ub->ub_unused_privvmpages += size; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+int ub_protected_charge(struct mm_struct *mm, unsigned long size, -+ unsigned long newflags, struct vm_area_struct *vma) -+{ -+ unsigned long flags; -+ struct file *file; -+ struct user_beancounter *ub; -+ -+ ub = mm->mm_ub; -+ if (ub == NULL) -+ return PRIVVM_NO_CHARGE; -+ -+ flags = vma->vm_flags; -+ if (!((newflags ^ flags) & VM_WRITE)) -+ return PRIVVM_NO_CHARGE; -+ -+ file = vma->vm_file; -+ if (!VM_UB_PRIVATE(newflags | VM_WRITE, file)) -+ return PRIVVM_NO_CHARGE; -+ -+ if (flags & VM_WRITE) -+ return PRIVVM_TO_SHARED; -+ -+ for (; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ if (__charge_privvm_locked(ub, size, UB_SOFT) < 0) -+ goto err; -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ return PRIVVM_TO_PRIVATE; -+ -+err: -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ return PRIVVM_ERROR; -+} -+ -+int ub_memory_charge(struct mm_struct *mm, unsigned long size, -+ unsigned vm_flags, struct file *vm_file, int sv) -+{ -+ struct user_beancounter *ub, *ubl; -+ unsigned long flags; -+ -+ ub = mm->mm_ub; -+ if (ub == NULL) -+ return 0; -+ -+ size >>= PAGE_SHIFT; -+ if (size > UB_MAXVALUE) -+ return -EINVAL; -+ -+ BUG_ON(sv != UB_SOFT && sv != UB_HARD); -+ -+ if (vm_flags & VM_LOCKED) { -+ if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv)) -+ goto out_err; -+ } -+ if (VM_UB_PRIVATE(vm_flags, vm_file)) { -+ for (ubl = ub; ubl->parent != NULL; ubl = ubl->parent); -+ spin_lock_irqsave(&ubl->ub_lock, flags); -+ if (__charge_privvm_locked(ubl, size, sv)) -+ goto out_private; -+ spin_unlock_irqrestore(&ubl->ub_lock, flags); -+ } -+ return 0; -+ -+out_private: -+ 
spin_unlock_irqrestore(&ubl->ub_lock, flags); -+ if (vm_flags & VM_LOCKED) -+ uncharge_beancounter(ub, UB_LOCKEDPAGES, size); -+out_err: -+ return -ENOMEM; -+} -+ -+void ub_memory_uncharge(struct mm_struct *mm, unsigned long size, -+ unsigned vm_flags, struct file *vm_file) -+{ -+ struct user_beancounter *ub; -+ unsigned long flags; -+ -+ ub = mm->mm_ub; -+ if (ub == NULL) -+ return; -+ -+ size >>= PAGE_SHIFT; -+ -+ if (vm_flags & VM_LOCKED) -+ uncharge_beancounter(ub, UB_LOCKEDPAGES, size); -+ if (VM_UB_PRIVATE(vm_flags, vm_file)) { -+ for (; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ __unused_privvm_dec_locked(ub, size); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ } -+} -+ -+int ub_locked_charge(struct mm_struct *mm, unsigned long size) -+{ -+ struct user_beancounter *ub; -+ -+ ub = mm->mm_ub; -+ if (ub == NULL) -+ return 0; -+ -+ return charge_beancounter(ub, UB_LOCKEDPAGES, -+ size >> PAGE_SHIFT, UB_HARD); -+} -+ -+void ub_locked_uncharge(struct mm_struct *mm, unsigned long size) -+{ -+ struct user_beancounter *ub; -+ -+ ub = mm->mm_ub; -+ if (ub == NULL) -+ return; -+ -+ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); -+} -+ -+int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size) -+{ -+ struct user_beancounter *ub; -+ -+ ub = shi->shmi_ub; -+ if (ub == NULL) -+ return 0; -+ -+ return charge_beancounter(ub, UB_LOCKEDPAGES, -+ size >> PAGE_SHIFT, UB_HARD); -+} -+ -+void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size) -+{ -+ struct user_beancounter *ub; -+ -+ ub = shi->shmi_ub; -+ if (ub == NULL) -+ return; -+ -+ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); -+} -+ -+ -+static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ ub->ub_tmpfs_respages++; -+ __ub_update_physpages(ub); -+ __ub_update_oomguarpages(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, 
flags); -+} -+ -+void ub_tmpfs_respages_inc(struct shmem_inode_info *shi) -+{ -+ struct user_beancounter *ub; -+ -+ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) -+ do_ub_tmpfs_respages_inc(ub); -+} -+ -+static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub, -+ unsigned long size) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ /* catch possible overflow */ -+ if (ub->ub_tmpfs_respages < size) { -+ uncharge_warn(ub, UB_TMPFSPAGES, -+ size, ub->ub_tmpfs_respages); -+ size = ub->ub_tmpfs_respages; -+ } -+ ub->ub_tmpfs_respages -= size; -+ /* update values what is the most interesting */ -+ __ub_update_physpages(ub); -+ __ub_update_oomguarpages(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+void ub_tmpfs_respages_sub(struct shmem_inode_info *shi, -+ unsigned long size) -+{ -+ struct user_beancounter *ub; -+ -+ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) -+ do_ub_tmpfs_respages_sub(ub, size); -+} -+ -+int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size) -+{ -+ int ret; -+ unsigned long flags; -+ struct user_beancounter *ub; -+ -+ ub = shi->shmi_ub; -+ if (ub == NULL) -+ return 0; -+ -+ for (; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD); -+ if (ret == 0) -+ __ub_update_privvm(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+ return ret; -+} -+ -+void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size) -+{ -+ unsigned long flags; -+ struct user_beancounter *ub; -+ -+ ub = shi->shmi_ub; -+ if (ub == NULL) -+ return; -+ -+ for (; ub->parent != NULL; ub = ub->parent); -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ __uncharge_beancounter_locked(ub, UB_SHMPAGES, size); -+ __ub_update_privvm(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+#ifdef CONFIG_USER_SWAP_ACCOUNTING -+static inline void do_ub_swapentry_inc(struct user_beancounter 
*ub) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ ub->ub_swap_pages++; -+ __ub_update_oomguarpages(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num, -+ struct user_beancounter *ub) -+{ -+ si->swap_ubs[num] = get_beancounter(ub); -+ for (; ub != NULL; ub = ub->parent) -+ do_ub_swapentry_inc(ub); -+} -+EXPORT_SYMBOL(ub_swapentry_inc); -+ -+static inline void do_ub_swapentry_dec(struct user_beancounter *ub) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ub->ub_lock, flags); -+ if (ub->ub_swap_pages <= 0) -+ uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages); -+ else -+ ub->ub_swap_pages--; -+ __ub_update_oomguarpages(ub); -+ spin_unlock_irqrestore(&ub->ub_lock, flags); -+} -+ -+void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num) -+{ -+ struct user_beancounter *ub, *ubp; -+ -+ ub = si->swap_ubs[num]; -+ si->swap_ubs[num] = NULL; -+ for (ubp = ub; ubp != NULL; ubp = ubp->parent) -+ do_ub_swapentry_dec(ubp); -+ put_beancounter(ub); -+} -+EXPORT_SYMBOL(ub_swapentry_dec); -+ -+int ub_swap_init(struct swap_info_struct *si, pgoff_t num) -+{ -+ struct user_beancounter **ubs; -+ -+ ubs = vmalloc(num * sizeof(struct user_beancounter *)); -+ if (ubs == NULL) -+ return -ENOMEM; -+ -+ memset(ubs, 0, num * sizeof(struct user_beancounter *)); -+ si->swap_ubs = ubs; -+ return 0; -+} -+ -+void ub_swap_fini(struct swap_info_struct *si) -+{ -+ if (si->swap_ubs) { -+ vfree(si->swap_ubs); -+ si->swap_ubs = NULL; -+ } -+} -+#endif -+ -+static int vmguar_enough_memory(struct vnotifier_block *self, -+ unsigned long event, void *arg, int old_ret) -+{ -+ struct user_beancounter *ub; -+ -+ if (event != VIRTINFO_ENOUGHMEM) -+ return old_ret; -+ -+ for (ub = current->mm->mm_ub; ub->parent != NULL; ub = ub->parent); -+ if (ub->ub_parms[UB_PRIVVMPAGES].held > -+ ub->ub_parms[UB_VMGUARPAGES].barrier) -+ return old_ret; -+ -+ return NOTIFY_OK; -+} -+ -+static struct 
vnotifier_block vmguar_notifier_block = { -+ .notifier_call = vmguar_enough_memory -+}; -+ -+static int __init init_vmguar_notifier(void) -+{ -+ virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block); -+ return 0; -+} -+ -+static void __exit fini_vmguar_notifier(void) -+{ -+ virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block); -+} -+ -+module_init(init_vmguar_notifier); -+module_exit(fini_vmguar_notifier); -diff -upr linux-2.6.16.orig/kernel/ub/ub_proc.c linux-2.6.16-026test009/kernel/ub/ub_proc.c ---- linux-2.6.16.orig/kernel/ub/ub_proc.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/ub/ub_proc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,389 @@ -+/* -+ * linux/fs/proc/proc_ub.c -+ * -+ * Copyright (C) 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg> -+ * Copyright (C) 2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ * TODO: -+ * -+ * Changes: -+ */ -+ -+#include <linux/errno.h> -+#include <linux/sched.h> -+#include <linux/kernel.h> -+#include <linux/mm.h> -+#include <linux/proc_fs.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_hash.h> -+#include <ub/ub_debug.h> -+#include <ub/ub_page.h> -+ -+#include <asm/page.h> -+#include <asm/uaccess.h> -+ -+/* -+ * we have 8 format strings depending on: -+ * 1. BITS_PER_LONG -+ * 2. CONFIG_UBC_KEEP_UNUSED -+ * 3. 
resource number (see out_proc_beancounter)
-+ */
-+
-+/*
-+ * Row prefix formats.  Every prefix must occupy exactly 10+2+12+1 = 25
-+ * characters, because UB_PROC_LINE_TEXT below is computed from those widths
-+ * and the output helpers store '\n' at buf[UB_PROC_LINE_TEXT].
-+ * REF2_FORMAT therefore needs two spaces between its fields: with one space
-+ * the row is a byte short and sprintf's terminating NUL lands inside the line.
-+ */
-+#ifdef CONFIG_UBC_KEEP_UNUSED
-+#define REF_FORMAT "%5.5s %4i: %-12s "
-+#define UID_HEAD_STR "uid ref"
-+#else
-+#define REF_FORMAT "%10.10s: %-12s "
-+#define UID_HEAD_STR "uid"
-+#endif
-+#define REF2_FORMAT "%10s  %-12s "
-+
-+#if BITS_PER_LONG == 32
-+#define RES_FORMAT "%10lu %10lu %10lu %10lu %10lu"
-+#define HEAD_FORMAT "%10s %10s %10s %10s %10s"
-+#define UB_PROC_LINE_TEXT (10+2+12+1+10+1+10+1+10+1+10+1+10)
-+#else
-+#define RES_FORMAT "%20lu %20lu %20lu %20lu %20lu"
-+#define HEAD_FORMAT "%20s %20s %20s %20s %20s"
-+#define UB_PROC_LINE_TEXT (10+2+12+1+20+1+20+1+20+1+20+1+20)
-+#endif
-+
-+/* One output row: UB_PROC_LINE_TEXT characters of text plus the newline. */
-+#define UB_PROC_LINE_LEN (UB_PROC_LINE_TEXT + 1)
-+
-+/* Emit the fixed-width "Version" row: text, space padding, then '\n'. */
-+static void out_proc_version(char *buf)
-+{
-+	int len;
-+
-+	len = sprintf(buf, "Version: 2.5");
-+	memset(buf + len, ' ', UB_PROC_LINE_TEXT - len);
-+	buf[UB_PROC_LINE_TEXT] = '\n';
-+}
-+
-+/* Emit the column-header row. */
-+static void out_proc_head(char *buf)
-+{
-+	sprintf(buf, REF2_FORMAT HEAD_FORMAT,
-+			UID_HEAD_STR, "resource", "held", "maxheld",
-+			"barrier", "limit", "failcnt");
-+	buf[UB_PROC_LINE_TEXT] = '\n';
-+}
-+
-+/*
-+ * Emit the row for resource number r of beancounter ub.
-+ * Only the first row (r == 0) carries the beancounter id (and, with
-+ * CONFIG_UBC_KEEP_UNUSED, its reference count); the other rows leave the
-+ * id column blank.
-+ */
-+static void out_proc_beancounter(char *buf, struct user_beancounter *ub, int r)
-+{
-+	if (r == 0) {
-+		char tmpbuf[64];
-+		print_ub_uid(ub, tmpbuf, sizeof(tmpbuf));
-+		sprintf(buf, REF_FORMAT RES_FORMAT,
-+				tmpbuf,
-+#ifdef CONFIG_UBC_KEEP_UNUSED
-+				atomic_read(&ub->ub_refcount),
-+#endif
-+				ub_rnames[r], ub->ub_parms[r].held,
-+				ub->ub_parms[r].maxheld, ub->ub_parms[r].barrier,
-+				ub->ub_parms[r].limit, ub->ub_parms[r].failcnt);
-+	} else
-+		sprintf(buf, REF2_FORMAT RES_FORMAT,
-+				"", ub_rnames[r],
-+				ub->ub_parms[r].held, ub->ub_parms[r].maxheld,
-+				ub->ub_parms[r].barrier, ub->ub_parms[r].limit,
-+				ub->ub_parms[r].failcnt);
-+
-+	buf[UB_PROC_LINE_TEXT] = '\n';
-+}
-+
-+/*
-+ * Decide whether the reader (running in exec_ub) may see ub.
-+ * Both chains are walked up to their top-level parents; unless the reader's
-+ * top-level beancounter is ub0, the two top-level beancounters must match.
-+ * Top-level beancounters are then always shown; sub-beancounters are shown
-+ * only when the file was opened with non-NULL private_data (see
-+ * ub_proc_open).
-+ */
-+static int ub_accessible(struct user_beancounter *ub,
-+		struct user_beancounter *exec_ub,
-+		struct file *file)
-+{
-+	struct user_beancounter *p, *q;
-+
-+	for (p = exec_ub; p->parent != NULL; p = p->parent);
-+	for (q = ub; q->parent != NULL; q = q->parent);
-+	if (p != get_ub0() && q != p)
-+		return 0;
-+	if (ub->parent == NULL)
-+		return 1;
-+	return file->private_data == NULL ? 0 : 1;
-+}
-+
-+/*
-+ * read() handler shared by both beancounter proc files.
-+ *
-+ * The file is a sequence of fixed-size rows (UB_PROC_LINE_LEN bytes each):
-+ * a version row, a header row, then UB_RESOURCES rows per visible
-+ * beancounter.  Each pass of the "again" loop regenerates up to one page of
-+ * rows starting from the row containing *poff, and copies the requested
-+ * part to user space.
-+ *
-+ * Returns the number of bytes copied, 0 at end of data or when the caller
-+ * lacks CAP_DAC_OVERRIDE/CAP_DAC_READ_SEARCH, -ENOBUFS if no page could be
-+ * allocated, -EINVAL for a negative offset, -EFAULT on a failed user copy.
-+ */
-+static ssize_t ub_proc_read(struct file *file, char *usrbuf, size_t len,
-+		loff_t *poff)
-+{
-+	ssize_t retval;
-+	char *buf;
-+	unsigned long flags;
-+	int i, resource;
-+	struct ub_hash_slot *slot;
-+	struct user_beancounter *ub;
-+	struct user_beancounter *exec_ub = get_exec_ub();
-+	loff_t n, off;
-+	int rem, produced, job, tocopy;
-+	const int is_capable =
-+		(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH));
-+
-+	retval = -ENOBUFS;
-+	buf = (char *)__get_free_page(GFP_KERNEL);
-+	if (buf == NULL)
-+		goto out;
-+
-+	retval = 0;
-+	if (!is_capable)
-+		goto out_free;
-+
-+	off = *poff;
-+	if (off < 0) /* can't happen, just in case */
-+		goto inval;
-+
-+again:
-+	i = 0;
-+	slot = ub_hash;
-+	n = off; /* The amount of data to skip */
-+	produced = 0;
-+	if (n < (UB_PROC_LINE_LEN * 2)) {
-+		if (n < UB_PROC_LINE_LEN) {
-+			out_proc_version(buf);
-+			produced += UB_PROC_LINE_LEN;
-+			n += UB_PROC_LINE_LEN;
-+		}
-+		out_proc_head(buf + produced);
-+		produced += UB_PROC_LINE_LEN;
-+		n += UB_PROC_LINE_LEN;
-+	}
-+	n -= (2 * UB_PROC_LINE_LEN);
-+	spin_lock_irqsave(&ub_hash_lock, flags);
-+	/* Skip whole beancounters that lie entirely before the offset. */
-+	while (1) {
-+		for (ub = slot->ubh_beans;
-+				ub != NULL && n >= (UB_RESOURCES * UB_PROC_LINE_LEN);
-+				ub = ub->ub_next)
-+			if (is_capable && ub_accessible(ub, exec_ub, file))
-+				n -= (UB_RESOURCES * UB_PROC_LINE_LEN);
-+		if (ub != NULL || ++i >= UB_HASH_SIZE)
-+			break;
-+		++slot;
-+	}
-+	rem = n; /* the amount of the data in the buffer to skip */
-+	job = PAGE_SIZE - UB_PROC_LINE_LEN + 1; /* end of buffer data */
-+	if (len < job - rem)
-+		job = rem + len;
-+	while (ub != NULL && produced < job) {
-+		if (is_capable && ub_accessible(ub, exec_ub, file))
-+			for (resource = 0;
-+					produced < job && resource < UB_RESOURCES;
-+					resource++, produced += UB_PROC_LINE_LEN)
-+			{
-+				out_proc_beancounter(buf + produced,
-+						ub, resource);
-+			}
-+		if (produced >= job)
-+			break;
-+		/* Find the next beancounter to produce more data. */
-+		ub = ub->ub_next;
-+		while (ub == NULL && ++i < UB_HASH_SIZE) {
-+			++slot;
-+			ub = slot->ubh_beans;
-+		}
-+	}
-+
-+	spin_unlock_irqrestore(&ub_hash_lock, flags);
-+	ub_debug(UBD_ALLOC, KERN_DEBUG "UB_PROC: produced %d, job %d, rem %d\n",
-+			produced, job, rem);
-+
-+	/*
-+	 * Temporary buffer `buf' contains `produced' bytes.
-+	 * Extract no more than `len' bytes at offset `rem'.
-+	 */
-+	if (produced <= rem)
-+		goto out_free;
-+	tocopy = produced - rem;
-+	if (len < tocopy)
-+		tocopy = len;
-+	if (!tocopy)
-+		goto out_free;
-+	if (copy_to_user(usrbuf, buf + rem, tocopy))
-+		goto fault;
-+	off += tocopy; /* can't overflow */
-+	*poff = off;
-+	len -= tocopy;
-+	retval += tocopy;
-+	if (!len)
-+		goto out_free;
-+	usrbuf += tocopy;
-+	goto again;
-+
-+fault:
-+	retval = -EFAULT;
-+out_free:
-+	free_page((unsigned long)buf);
-+out:
-+	return retval;
-+
-+inval:
-+	retval = -EINVAL;
-+	goto out_free;
-+}
-+
-+/*
-+ * open() handler: private_data is NULL for "user_beancounters" and non-NULL
-+ * for any other name; ub_accessible() uses it to decide whether
-+ * sub-beancounters are listed.
-+ */
-+static int ub_proc_open(struct inode *inode, struct file *file)
-+{
-+	file->private_data = strcmp(file->f_dentry->d_name.name,
-+			"user_beancounters") ?
-+		(void *)-1 : NULL;
-+	return 0;
-+}
-+
-+static struct file_operations ub_file_operations = {
-+	.read = &ub_proc_read,
-+	.open = &ub_proc_open
-+};
-+
-+#ifdef CONFIG_UBC_DEBUG_KMEM
-+#include <linux/seq_file.h>
-+#include <linux/kmem_cache.h>
-+
-+/*
-+ * seq_file start: take ub_hash_lock (released in ubd_stop) and return the
-+ * *pos-th beancounter of the hash, remembering its slot in m->private.
-+ */
-+static void *ubd_start(struct seq_file *m, loff_t *pos)
-+{
-+	loff_t n = *pos;
-+	struct user_beancounter *ub;
-+	long slot;
-+
-+	spin_lock_irq(&ub_hash_lock);
-+	for (slot = 0; slot < UB_HASH_SIZE; slot++)
-+		for (ub = ub_hash[slot].ubh_beans; ub; ub = ub->ub_next) {
-+			if (n == 0) {
-+				m->private = (void *)slot;
-+				return (void *)ub;
-+			}
-+			n--;
-+		}
-+	return NULL;
-+}
-+
-+/* seq_file next: advance to the following beancounter, crossing slots. */
-+static void *ubd_next(struct seq_file *m, void *p, loff_t *pos)
-+{
-+	struct user_beancounter *ub;
-+	long slot;
-+
-+	ub = (struct user_beancounter *)p;
-+	slot = (long)m->private;
-+
-+	++*pos;
-+	ub = ub->ub_next;
-+	while (1) {
-+		for (; ub; ub = ub->ub_next) {
-+			m->private = (void *)slot;
-+			return (void *)ub;
-+		}
-+		slot++;
-+		if (slot == UB_HASH_SIZE)
-+			break;
-+		ub = ub_hash[slot].ubh_beans;
-+	}
-+	return NULL;
-+}
-+
-+/* seq_file stop: drop the lock taken in ubd_start. */
-+static void ubd_stop(struct seq_file *m, void *p)
-+{
-+	spin_unlock_irq(&ub_hash_lock);
-+}
-+
-+#define PROC_LINE_FMT "\t%-17s\t%5lu\t%5lu\n"
-+
-+/*
-+ * seq_file show: print one beancounter's id and refcount, its per-CPU
-+ * statistics summed over all CPUs, and every entry of its cache-counter
-+ * list.
-+ */
-+static int ubd_show(struct seq_file *m, void *p)
-+{
-+	struct user_beancounter *ub;
-+	struct ub_cache_counter *cc;
-+	long pages, vmpages, pbc, swap, unmap;
-+	int i;
-+	char id[64];
-+
-+	ub = (struct user_beancounter *)p;
-+	print_ub_uid(ub, id, sizeof(id));
-+	seq_printf(m, "%s:%d\n", id, atomic_read(&ub->ub_refcount));
-+
-+	pages = vmpages = pbc = swap = unmap = 0;
-+	for (i = 0; i < NR_CPUS; i++) {
-+		pages += ub->ub_stat[i].pages_charged;
-+		vmpages += ub->ub_stat[i].vmalloc_charged;
-+		pbc += ub->ub_stat[i].pbcs;
-+		swap += ub->ub_stat[i].swapin;
-+		unmap += ub->ub_stat[i].unmap;
-+	}
-+	/* the sums are printed with %lu, so clamp negative totals to zero */
-+	if (pages < 0)
-+		pages = 0;
-+	if (vmpages < 0)
-+		vmpages = 0;
-+	seq_printf(m, PROC_LINE_FMT, "pages", pages, PAGE_SIZE);
-+	seq_printf(m, PROC_LINE_FMT, "vmalloced", vmpages, PAGE_SIZE);
-+
-+	seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_UNUSEDPRIVVM],
-+			ub->ub_unused_privvmpages, PAGE_SIZE);
-+	seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_TMPFSPAGES],
-+			ub->ub_tmpfs_respages, PAGE_SIZE);
-+	seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_SWAPPAGES],
-+			ub->ub_swap_pages, PAGE_SIZE);
-+	seq_printf(m, PROC_LINE_FMT, "pbcs", pbc,
-+			(unsigned long)sizeof(struct page_beancounter));
-+
-+	seq_printf(m, PROC_LINE_FMT, "swapin", swap, 0UL);
-+	seq_printf(m, PROC_LINE_FMT, "unmap", unmap, 0UL);
-+	/* interrupts are disabled by locking ub_hash_lock */
-+	spin_lock(&cc_lock);
-+	list_for_each_entry (cc, &ub->ub_cclist, ulist) {
-+		kmem_cache_t *cachep;
-+
-+		cachep = cc->cachep;
-+		seq_printf(m, PROC_LINE_FMT,
-+				cachep->name,
-+				cc->counter,
-+				(unsigned long)cachep->objuse);
-+	}
-+	spin_unlock(&cc_lock);
-+	return 0;
-+}
-+
-+static struct seq_operations kmemdebug_op = {
-+	.start = ubd_start,
-+	.next = ubd_next,
-+	.stop = ubd_stop,
-+	.show = ubd_show,
-+};
-+
-+static int kmem_debug_open(struct inode *inode, struct file *file)
-+{
-+	return seq_open(file, &kmemdebug_op);
-+}
-+
-+static struct file_operations kmem_debug_ops = {
-+	.open = kmem_debug_open,
-+	.read = seq_read,
-+	.llseek = seq_lseek,
-+	.release = seq_release,
-+};
-+#endif
-+
-+/*
-+ * Create the beancounter proc entries; failure to create any of them is
-+ * fatal.  Each panic message names the entry that could not be created.
-+ */
-+void __init ub_init_proc(void)
-+{
-+	struct proc_dir_entry *entry;
-+
-+	entry = create_proc_entry("user_beancounters", S_IRUGO, NULL);
-+	if (entry)
-+		entry->proc_fops = &ub_file_operations;
-+	else
-+		panic("Can't create /proc/user_beancounters entry!\n");
-+
-+	entry = create_proc_entry("user_beancounters_sub", S_IRUGO, NULL);
-+	if (entry)
-+		entry->proc_fops = &ub_file_operations;
-+	else
-+		panic("Can't create /proc/user_beancounters_sub entry!\n");
-+
-+#ifdef CONFIG_UBC_DEBUG_KMEM
-+	entry = create_proc_entry("user_beancounters_debug", S_IRUGO, NULL);
-+	if (entry)
-+		entry->proc_fops = &kmem_debug_ops;
-+	else
-+		panic("Can't create /proc/user_beancounters_debug entry!\n");
-+#endif
-+}
-diff -upr linux-2.6.16.orig/kernel/ub/ub_stat.c linux-2.6.16-026test009/kernel/ub/ub_stat.c
---- linux-2.6.16.orig/kernel/ub/ub_stat.c 2006-04-19 15:02:49.000000000 +0400
-+++ linux-2.6.16-026test009/kernel/ub/ub_stat.c 2006-04-19 15:02:12.000000000 +0400
-@@ -0,0 +1,465 @@
-+/*
-+ * kernel/ub/ub_stat.c
-+ *
-+ * Copyright (C) 2005 SWsoft
-+ * All rights reserved.
-+ *
-+ * Licensing governed by "linux/COPYING.SWsoft" file.
-+ *
-+ */
-+
-+#include <linux/config.h>
-+#include <linux/timer.h>
-+#include <linux/sched.h>
-+#include <linux/init.h>
-+#include <linux/jiffies.h>
-+#include <linux/list.h>
-+#include <linux/errno.h>
-+#include <linux/suspend.h>
-+
-+#include <asm/uaccess.h>
-+#include <asm/param.h>
-+
-+#include <ub/beancounter.h>
-+#include <ub/ub_hash.h>
-+#include <ub/ub_stat.h>
-+
-+static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED;
-+static LIST_HEAD(ubs_notify_list);
-+static long ubs_min_interval;
-+static ubstattime_t ubs_start_time, ubs_end_time;
-+static struct timer_list ubs_timer;
-+
-+/*
-+ * Copy the uids of all top-level beancounters (parent == NULL) to the user
-+ * buffer, one page-sized batch at a time.  Between batches the hash lock is
-+ * dropped; a reference on the last visited beancounter keeps the walk
-+ * position valid.
-+ * Returns the number of bytes copied, -ENOMEM or -EFAULT.
-+ */
-+static int ubstat_get_list(void *buf, long size)
-+{
-+	int retval;
-+	unsigned long flags;
-+	int slotnr;
-+	struct ub_hash_slot *slot;
-+	struct user_beancounter *ub, *last_ub;
-+	long *page, *ptr, *end;
-+	int len;
-+
-+	page = (long *)__get_free_page(GFP_KERNEL);
-+	if (page == NULL)
-+		return -ENOMEM;
-+
-+	retval = 0;
-+	slotnr = 0;
-+	slot = ub_hash;
-+	last_ub = NULL;
-+	while (1) {
-+		ptr = page;
-+		end = page + PAGE_SIZE / sizeof(*ptr);
-+
-+		spin_lock_irqsave(&ub_hash_lock, flags);
-+		if (last_ub == NULL)
-+			ub = slot->ubh_beans;
-+		else
-+			ub = last_ub->ub_next;
-+		while (1) {
-+			for (; ub != NULL; ub = ub->ub_next) {
-+				if (ub->parent != NULL)
-+					continue;
-+				*ptr++ = ub->ub_uid;
-+				if (ptr == end)
-+					break;
-+			}
-+			if (ptr == end)
-+				break;
-+			++slot;
-+			if (++slotnr >= UB_HASH_SIZE)
-+				break;
-+			ub = slot->ubh_beans;
-+		}
-+		if (ptr == page)
-+			goto out_unlock;
-+		if (ub != NULL)
-+			get_beancounter(ub);
-+		spin_unlock_irqrestore(&ub_hash_lock, flags);
-+
-+		if (last_ub != NULL)
-+			put_beancounter(last_ub);
-+		last_ub = ub; /* last visited beancounter in the slot */
-+
-+		len = min_t(long, (ptr - page) * sizeof(*ptr), size);
-+		if (copy_to_user(buf, page, len)) {
-+			retval = -EFAULT;
-+			break;
-+		}
-+		retval += len;
-+		if (len < PAGE_SIZE)
-+			break;
-+		buf += len;
-+		size -= len;
-+	}
-+out:
-+	if (last_ub != NULL)
-+		put_beancounter(last_ub);
-+	free_page((unsigned long)page);
-+	return retval;
-+
-+out_unlock:
-+	spin_unlock_irqrestore(&ub_hash_lock, flags);
-+	goto out;
-+}
-+
-+/*
-+ * Report the current statistics interval (start, end) and the current time
-+ * to user space.  Returns the number of bytes copied or -EFAULT.
-+ */
-+static int ubstat_gettime(void *buf, long size)
-+{
-+	ubgettime_t data;
-+	int retval;
-+
-+	spin_lock(&ubs_notify_lock);
-+	data.start_time = ubs_start_time;
-+	data.end_time = ubs_end_time;
-+	data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ;
-+	spin_unlock(&ubs_notify_lock);
-+
-+	retval = min_t(long, sizeof(data), size);
-+	if (copy_to_user(buf, &data, retval))
-+		retval = -EFAULT;
-+	return retval;
-+}
-+
-+/*
-+ * Fill kbuf with the interval times and the stored maxheld/failcnt of one
-+ * resource.  Returns the number of bytes written to kbuf.
-+ */
-+static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf)
-+{
-+	struct {
-+		ubstattime_t start_time;
-+		ubstattime_t end_time;
-+		ubstatparm_t param[1];
-+	} *data;
-+
-+	data = kbuf;
-+	data->start_time = ubs_start_time;
-+	data->end_time = ubs_end_time;
-+
-+	data->param[0].maxheld = ub->ub_store[res].maxheld;
-+	data->param[0].failcnt = ub->ub_store[res].failcnt;
-+
-+	return sizeof(*data);
-+}
-+
-+/*
-+ * Same as ubstat_do_read_one() but for every resource that fits into
-+ * `size' bytes.  Returns the number of bytes accounted for in kbuf.
-+ */
-+static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size)
-+{
-+	int wrote;
-+	struct {
-+		ubstattime_t start_time;
-+		ubstattime_t end_time;
-+		ubstatparm_t param[UB_RESOURCES];
-+	} *data;
-+	int resource;
-+
-+	data = kbuf;
-+	data->start_time = ubs_start_time;
-+	data->end_time = ubs_end_time;
-+	wrote = sizeof(data->start_time) + sizeof(data->end_time);
-+
-+	for (resource = 0; resource < UB_RESOURCES; resource++) {
-+		if (size < wrote + sizeof(data->param[resource]))
-+			break;
-+		data->param[resource].maxheld = ub->ub_store[resource].maxheld;
-+		data->param[resource].failcnt = ub->ub_store[resource].failcnt;
-+		wrote += sizeof(data->param[resource]);
-+	}
-+
-+	return wrote;
-+}
-+
-+/*
-+ * Like ubstat_do_read_all() but copies the complete stored struct ubparm of
-+ * each resource and zeroes the two trailing __unused fields.
-+ */
-+static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf,
-+		int size)
-+{
-+	int wrote;
-+	struct {
-+		ubstattime_t start_time;
-+		ubstattime_t end_time;
-+		ubstatparmf_t param[UB_RESOURCES];
-+	} *data;
-+	int resource;
-+
-+	data = kbuf;
-+	data->start_time = ubs_start_time;
-+	data->end_time = ubs_end_time;
-+	wrote = sizeof(data->start_time) + sizeof(data->end_time);
-+
-+	for (resource = 0; resource < UB_RESOURCES; resource++) {
-+		if (size < wrote + sizeof(data->param[resource]))
-+			break;
-+		/* The beginning of ubstatparmf_t matches struct ubparm. */
-+		memcpy(&data->param[resource], &ub->ub_store[resource],
-+				sizeof(ub->ub_store[resource]));
-+		data->param[resource].__unused1 = 0;
-+		data->param[resource].__unused2 = 0;
-+		wrote += sizeof(data->param[resource]);
-+	}
-+	return wrote;
-+}
-+
-+/*
-+ * Dispatch a UBSTAT_READ_* command: build the reply in a temporary page
-+ * under ubs_notify_lock, then copy at most `size' bytes to user space.
-+ * Returns the number of bytes copied, -EINVAL for an unknown command or
-+ * resource id, -ENOMEM or -EFAULT.
-+ */
-+static int ubstat_get_stat(struct user_beancounter *ub, long cmd,
-+		void *buf, long size)
-+{
-+	void *kbuf;
-+	int retval;
-+
-+	kbuf = (void *)__get_free_page(GFP_KERNEL);
-+	if (kbuf == NULL)
-+		return -ENOMEM;
-+
-+	spin_lock(&ubs_notify_lock);
-+	switch (UBSTAT_CMD(cmd)) {
-+		case UBSTAT_READ_ONE:
-+			retval = -EINVAL;
-+			if (UBSTAT_PARMID(cmd) >= UB_RESOURCES)
-+				break;
-+			retval = ubstat_do_read_one(ub,
-+					UBSTAT_PARMID(cmd), kbuf);
-+			break;
-+		case UBSTAT_READ_ALL:
-+			retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE);
-+			break;
-+		case UBSTAT_READ_FULL:
-+			retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE);
-+			break;
-+		default:
-+			retval = -EINVAL;
-+	}
-+	spin_unlock(&ubs_notify_lock);
-+
-+	if (retval > 0) {
-+		retval = min_t(long, retval, size);
-+		if (copy_to_user(buf, kbuf, retval))
-+			retval = -EFAULT;
-+	}
-+
-+	free_page((unsigned long)kbuf);
-+	return retval;
-+}
-+
-+/*
-+ * Register, update or cancel the calling task's end-of-interval signal.
-+ *
-+ * A fresh entry is allocated before the lock is taken.  If the task is
-+ * already registered, the fresh entry is freed and the existing one is
-+ * reused (and unlinked before being re-added or freed).  req->signum == 0
-+ * cancels the registration.  req->maxinterval may shorten the pending
-+ * timer.
-+ * Returns 0, -ENOMEM, or -EINVAL when req->maxinterval < 1.
-+ */
-+static int ubstat_handle_notifrq(ubnotifrq_t *req)
-+{
-+	int retval;
-+	struct ub_stat_notify *new_notify;
-+	struct list_head *entry;
-+	struct task_struct *tsk_to_free;
-+
-+	/* size of the structure, not of the pointer to it */
-+	new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL);
-+	if (new_notify == NULL)
-+		return -ENOMEM;
-+
-+	tsk_to_free = NULL;
-+	INIT_LIST_HEAD(&new_notify->list);
-+
-+	spin_lock(&ubs_notify_lock);
-+	list_for_each(entry, &ubs_notify_list) {
-+		struct ub_stat_notify *notify;
-+
-+		notify = list_entry(entry, struct ub_stat_notify, list);
-+		if (notify->task == current) {
-+			kfree(new_notify);
-+			new_notify = notify;
-+			break;
-+		}
-+	}
-+
-+	retval = -EINVAL;
-+	if (req->maxinterval < 1) {
-+		/*
-+		 * No existing registration was found, so new_notify is still
-+		 * the fresh allocation and nobody else will free it.
-+		 */
-+		if (entry == &ubs_notify_list)
-+			kfree(new_notify);
-+		goto out_unlock;
-+	}
-+	if (req->maxinterval > TIME_MAX_SEC)
-+		req->maxinterval = TIME_MAX_SEC;
-+	if (req->maxinterval < ubs_min_interval) {
-+		unsigned long dif;
-+
-+		ubs_min_interval = req->maxinterval;
-+		dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ;
-+		if (dif > req->maxinterval)
-+			mod_timer(&ubs_timer,
-+					ubs_timer.expires -
-+					(dif - req->maxinterval) * HZ);
-+	}
-+
-+	if (entry != &ubs_notify_list) {
-+		list_del(&new_notify->list);
-+		tsk_to_free = new_notify->task;
-+	}
-+	if (req->signum) {
-+		new_notify->task = current;
-+		get_task_struct(new_notify->task);
-+		new_notify->signum = req->signum;
-+		list_add(&new_notify->list, &ubs_notify_list);
-+	} else
-+		kfree(new_notify);
-+	retval = 0;
-+out_unlock:
-+	spin_unlock(&ubs_notify_lock);
-+	if (tsk_to_free != NULL)
-+		put_task_struct(tsk_to_free);
-+	return retval;
-+}
-+
-+/*
-+ * former sys_ubstat
-+ *
-+ * UBSTAT_UBPARMNUM and UBSTAT_UBLIST are answered without a capability
-+ * check; everything else needs CAP_DAC_OVERRIDE or CAP_DAC_READ_SEARCH.
-+ * After a successful query the notification request at user address arg2
-+ * is processed, and its error (if any) replaces the return value.
-+ */
-+long do_ubstat(int func, unsigned long arg1, unsigned long arg2, void *buf,
-+		long size)
-+{
-+	int retval;
-+	struct user_beancounter *ub;
-+
-+	if (func == UBSTAT_UBPARMNUM)
-+		return UB_RESOURCES;
-+	if (func == UBSTAT_UBLIST)
-+		return ubstat_get_list(buf, size);
-+	if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)))
-+		return -EPERM;
-+
-+	if (func == UBSTAT_GETTIME) {
-+		retval = ubstat_gettime(buf, size);
-+		goto notify;
-+	}
-+
-+	ub = get_exec_ub();
-+	if (ub != NULL && ub->ub_uid == arg1)
-+		get_beancounter(ub);
-+	else /* FIXME must be if (ve_is_super) */
-+		ub = get_beancounter_byuid(arg1, 0);
-+
-+	if (ub == NULL)
-+		return -ESRCH;
-+
-+	retval = ubstat_get_stat(ub, func, buf, size);
-+	put_beancounter(ub);
-+notify:
-+	/* Handle request for notification */
-+	if (retval >= 0) {
-+		ubnotifrq_t notifrq;
-+		int err;
-+
-+		err = -EFAULT;
-+		if (!copy_from_user(&notifrq, (void *)arg2, sizeof(notifrq)))
-+			err = ubstat_handle_notifrq(&notifrq);
-+		if (err)
-+			retval = err;
-+	}
-+
-+	return retval;
-+}
-+
-+/*
-+ * Snapshot ub_parms into ub_store for every resource and restart the
-+ * min/max tracking from the current held value.
-+ */
-+static void ubstat_save_onestat(struct user_beancounter *ub)
-+{
-+	int resource;
-+
-+	/* called with local irq disabled */
-+	spin_lock(&ub->ub_lock);
-+	for (resource = 0; resource < UB_RESOURCES; resource++) {
-+		memcpy(&ub->ub_store[resource], &ub->ub_parms[resource],
-+				sizeof(struct ubparm));
-+		ub->ub_parms[resource].minheld =
-+			ub->ub_parms[resource].maxheld =
-+			ub->ub_parms[resource].held;
-+	}
-+	spin_unlock(&ub->ub_lock);
-+}
-+
-+/* Snapshot the statistics of every beancounter under ub_hash_lock. */
-+static void ubstat_save_statistics(void)
-+{
-+	unsigned long flags;
-+	int i;
-+	struct user_beancounter *ub;
-+
-+	spin_lock_irqsave(&ub_hash_lock, flags);
-+	for_each_beancounter(i, ub)
-+		ubstat_save_onestat(ub);
-+	spin_unlock_irqrestore(&ub_hash_lock, flags);
-+}
-+
-+/* Timer callback: wake the task passed as timer data (the ubstatd thread). */
-+static void ubstatd_timeout(unsigned long __data)
-+{
-+	struct task_struct *p;
-+
-+	p = (struct task_struct *) __data;
-+	wake_up_process(p);
-+}
-+
-+/*
-+ * Safe wrapper for send_sig. It prevents a race with release_task
-+ * for sighand.
-+ * Should be called under tasklist_lock.
-+ */
-+static void task_send_sig(struct ub_stat_notify *notify)
-+{
-+	if (likely(notify->task->sighand != NULL))
-+		send_sig(notify->signum, notify->task, 1);
-+}
-+
-+/*
-+ * End-of-interval work: advance the interval times, re-arm the timer,
-+ * snapshot all statistics, signal every registered task, and then release
-+ * the (one-shot) registrations outside the locks.
-+ */
-+static inline void do_notifies(void)
-+{
-+	LIST_HEAD(notif_free_list);
-+	struct ub_stat_notify *notify;
-+	struct ub_stat_notify *tmp;
-+
-+	spin_lock(&ubs_notify_lock);
-+	ubs_start_time = ubs_end_time;
-+	/*
-+	 * the expression below relies on time being unsigned long and
-+	 * arithmetic promotion rules
-+	 */
-+	ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ;
-+	mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ);
-+	ubs_min_interval = TIME_MAX_SEC;
-+	/* save statistics accumulated for the interval */
-+	ubstat_save_statistics();
-+	/* send signals */
-+	read_lock(&tasklist_lock);
-+	while (!list_empty(&ubs_notify_list)) {
-+		notify = list_entry(ubs_notify_list.next,
-+				struct ub_stat_notify, list);
-+		task_send_sig(notify);
-+		list_del(&notify->list);
-+		list_add(&notify->list, &notif_free_list);
-+	}
-+	read_unlock(&tasklist_lock);
-+	spin_unlock(&ubs_notify_lock);
-+
-+	list_for_each_entry_safe(notify, tmp, &notif_free_list, list) {
-+		put_task_struct(notify->task);
-+		kfree(notify);
-+	}
-+}
-+
-+/*
-+ * Kernel thread
-+ *
-+ * Sleeps until ubs_timer expires, then runs do_notifies().
-+ */
-+static int ubstatd(void *unused)
-+{
-+	/* daemonize call will take care of signals */
-+	daemonize("ubstatd");
-+
-+	ubs_timer.data = (unsigned long)current;
-+	ubs_timer.function = ubstatd_timeout;
-+	add_timer(&ubs_timer);
-+
-+	while (1) {
-+		set_task_state(current, TASK_INTERRUPTIBLE);
-+		if (time_after(ubs_timer.expires, jiffies)) {
-+			schedule();
-+			try_to_freeze();
-+			continue;
-+		}
-+
-+		__set_task_state(current, TASK_RUNNING);
-+		do_notifies();
-+	}
-+	return 0;
-+}
-+
-+/* Initialise the interval state and start the ubstatd thread. */
-+static int __init ubstatd_init(void)
-+{
-+	init_timer(&ubs_timer);
-+	ubs_timer.expires = TIME_MAX_JIF;
-+	ubs_min_interval = TIME_MAX_SEC;
-+	ubs_start_time = ubs_end_time = 0;
-+
-+	kernel_thread(ubstatd, NULL, 0);
-+	return 0;
-+}
-+
-+module_init(ubstatd_init);
-diff
-upr linux-2.6.16.orig/kernel/ub/ub_sys.c linux-2.6.16-026test009/kernel/ub/ub_sys.c
---- linux-2.6.16.orig/kernel/ub/ub_sys.c 2006-04-19 15:02:49.000000000 +0400
-+++ linux-2.6.16-026test009/kernel/ub/ub_sys.c 2006-04-19 15:02:12.000000000 +0400
-@@ -0,0 +1,154 @@
-+/*
-+ * kernel/ub/ub_sys.c
-+ *
-+ * Copyright (C) 2005 SWsoft
-+ * All rights reserved.
-+ *
-+ * Licensing governed by "linux/COPYING.SWsoft" file.
-+ *
-+ */
-+
-+#include <linux/config.h>
-+#include <asm/uaccess.h>
-+
-+#include <ub/beancounter.h>
-+
-+#ifndef CONFIG_USER_RESOURCE
-+/* Without CONFIG_USER_RESOURCE every beancounter syscall is a stub. */
-+asmlinkage long sys_getluid(void)
-+{
-+	return -ENOSYS;
-+}
-+
-+asmlinkage long sys_setluid(uid_t uid)
-+{
-+	return -ENOSYS;
-+}
-+
-+asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
-+		unsigned long *limits)
-+{
-+	return -ENOSYS;
-+}
-+
-+asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2,
-+		void *buf, long size)
-+{
-+	return -ENOSYS;
-+}
-+#else /* CONFIG_USER_RESOURCE */
-+
-+/*
-+ * The (rather boring) getluid syscall
-+ *
-+ * Returns the uid of the caller's exec beancounter, or -EINVAL if it has
-+ * none.
-+ */
-+asmlinkage long sys_getluid(void)
-+{
-+	struct user_beancounter *ub;
-+
-+	ub = get_exec_ub();
-+	if (ub == NULL)
-+		return -EINVAL;
-+
-+	return ub->ub_uid;
-+}
-+
-+/*
-+ * The setluid syscall
-+ *
-+ * Switches the calling task to the beancounter with the given uid
-+ * (get_beancounter_byuid() is called with its second argument set to 1).
-+ * Both exec_ub and fork_sub of the task are replaced and the references to
-+ * the previous ones are dropped.
-+ * Returns 0, -EINVAL for uid -1, -EPERM without CAP_SETUID, or -ENOBUFS if
-+ * no beancounter could be obtained.
-+ */
-+asmlinkage long sys_setluid(uid_t uid)
-+{
-+	struct user_beancounter *ub;
-+	struct task_beancounter *task_bc;
-+	int error;
-+
-+	task_bc = &current->task_bc;
-+
-+	/* You may not disown a setluid */
-+	error = -EINVAL;
-+	if (uid == (uid_t)-1)
-+		goto out;
-+
-+	/* You may only set an ub as root */
-+	error = -EPERM;
-+	if (!capable(CAP_SETUID))
-+		goto out;
-+
-+	/* Ok - set up a beancounter entry for this user */
-+	error = -ENOBUFS;
-+	ub = get_beancounter_byuid(uid, 1);
-+	if (ub == NULL)
-+		goto out;
-+
-+	ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) "
-+			"for %.20s pid %d\n",
-+			ub, atomic_read(&ub->ub_refcount),
-+			current->comm, current->pid);
-+	/* install bc */
-+	put_beancounter(task_bc->exec_ub);
-+	task_bc->exec_ub = ub;
-+	put_beancounter(task_bc->fork_sub);
-+	task_bc->fork_sub = get_beancounter(ub);
-+	error = 0;
-+out:
-+	return error;
-+}
-+
-+/*
-+ * The setbeanlimit syscall
-+ *
-+ * Sets barrier (limits[0]) and limit (limits[1]) of one resource of an
-+ * existing beancounter.  Requires CAP_SYS_RESOURCE and a caller in the
-+ * super VE.
-+ * Returns 0, -EPERM, -EINVAL (bad resource id or value above UB_MAXVALUE),
-+ * -EFAULT, or -ENOENT if no beancounter with that uid exists.
-+ */
-+asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
-+		unsigned long *limits)
-+{
-+	int error;
-+	unsigned long flags;
-+	struct user_beancounter *ub;
-+	unsigned long new_limits[2];
-+
-+	error = -EPERM;
-+	if(!capable(CAP_SYS_RESOURCE))
-+		goto out;
-+
-+	if (!ve_is_super(get_exec_env()))
-+		goto out;
-+
-+	error = -EINVAL;
-+	if (resource >= UB_RESOURCES)
-+		goto out;
-+
-+	error = -EFAULT;
-+	if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
-+		goto out;
-+
-+	error = -EINVAL;
-+	if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
-+		goto out;
-+
-+	error = -ENOENT;
-+	ub = get_beancounter_byuid(uid, 0);
-+	if (ub == NULL) {
-+		ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid);
-+		goto out;
-+	}
-+
-+	spin_lock_irqsave(&ub->ub_lock, flags);
-+	ub->ub_parms[resource].barrier = new_limits[0];
-+	ub->ub_parms[resource].limit = new_limits[1];
-+	spin_unlock_irqrestore(&ub->ub_lock, flags);
-+
-+	put_beancounter(ub);
-+
-+	error = 0;
-+out:
-+	return error;
-+}
-+
-+extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
-+		void *buf, long size);
-+/* The ubstat syscall: super-VE only, forwards to do_ubstat(). */
-+asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2,
-+		void *buf, long size)
-+{
-+	if (!ve_is_super(get_exec_env()))
-+		return -EPERM;
-+
-+	return do_ubstat(func, arg1, arg2, buf, size);
-+}
-+#endif
-diff -upr linux-2.6.16.orig/kernel/user.c linux-2.6.16-026test009/kernel/user.c
---- linux-2.6.16.orig/kernel/user.c 2006-04-19 15:02:01.000000000 +0400
-+++ linux-2.6.16-026test009/kernel/user.c 2006-04-19 15:02:12.000000000 +0400
-@@ -14,6 +14,7 @@
- #include <linux/bitops.h>
- #include <linux/key.h>
- #include <linux/interrupt.h>
-+#include <linux/module.h>
- 
- /*
-  * UID task count cache, to get fast user lookup in "alloc_uid"
-@@ -24,7 +25,20 @@
- #define UIDHASH_SZ (1 << UIDHASH_BITS)
- #define UIDHASH_MASK (UIDHASH_SZ - 1)
- #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
--#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
-+#define __uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
-+
-+#ifdef CONFIG_VE
-+#define UIDHASH_MASK_VE (UIDHASH_SZ_VE - 1)
-+#define __uidhashfn_ve(uid) (((uid >> UIDHASH_BITS_VE) ^ uid) & \
-+		UIDHASH_MASK_VE)
-+#define __uidhashentry_ve(uid, envid) ((envid)->uidhash_table + \
-+		__uidhashfn_ve(uid))
-+#define uidhashentry_ve(uid) (ve_is_super(get_exec_env()) ? \
-+		__uidhashentry(uid) : \
-+		__uidhashentry_ve(uid, get_exec_env()))
-+#else
-+#define uidhashentry_ve(uid) __uidhashentry(uid)
-+#endif
- 
- static kmem_cache_t *uid_cachep;
- static struct list_head uidhash_table[UIDHASH_SZ];
-@@ -96,7 +110,7 @@ struct user_struct *find_user(uid_t uid)
- 	unsigned long flags;
- 
- 	spin_lock_irqsave(&uidhash_lock, flags);
--	ret = uid_hash_find(uid, uidhashentry(uid));
-+	ret = uid_hash_find(uid, uidhashentry_ve(uid));
- 	spin_unlock_irqrestore(&uidhash_lock, flags);
- 	return ret;
- }
-@@ -115,10 +129,11 @@ void free_uid(struct user_struct *up)
- 	}
- 	local_irq_restore(flags);
- }
-+EXPORT_SYMBOL_GPL(free_uid);
- 
- struct user_struct * alloc_uid(uid_t uid)
- {
--	struct list_head *hashent = uidhashentry(uid);
-+	struct list_head *hashent = uidhashentry_ve(uid);
- 	struct user_struct *up;
- 
- 	spin_lock_irq(&uidhash_lock);
-@@ -168,6 +183,7 @@ struct user_struct * alloc_uid(uid_t uid
- 	}
- 	return up;
- }
-+EXPORT_SYMBOL_GPL(alloc_uid);
- 
- void switch_uid(struct user_struct *new_user)
- {
-@@ -186,21 +202,21 @@ void switch_uid(struct user_struct *new_
- 	free_uid(old_user);
- 	suid_keys(current);
- }
--
-+EXPORT_SYMBOL_GPL(switch_uid);
- 
- static int __init uid_cache_init(void)
- {
- 	int n;
- 
- 	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
--			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
-+			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL);
- 
- 	for(n = 0; n < UIDHASH_SZ; ++n)
- 		INIT_LIST_HEAD(uidhash_table + n);
- 
- 	/* Insert the root user immediately (init already runs as root) */
- 	spin_lock_irq(&uidhash_lock);
--	uid_hash_insert(&root_user, uidhashentry(0));
-+	uid_hash_insert(&root_user, __uidhashentry(0));
- 	spin_unlock_irq(&uidhash_lock);
- 
- 	return 0;
-diff -upr linux-2.6.16.orig/kernel/ve.c linux-2.6.16-026test009/kernel/ve.c
---- linux-2.6.16.orig/kernel/ve.c 2006-04-19 15:02:49.000000000 +0400
-+++ linux-2.6.16-026test009/kernel/ve.c 2006-04-19 15:02:12.000000000 +0400
-@@ -0,0 +1,188 @@
-+/*
-+ * linux/kernel/ve.c
-+ *
-+ * Copyright (C) 2000-2005 SWsoft
-+ * All rights reserved.
-+ *
-+ * Licensing governed by "linux/COPYING.SWsoft" file.
-+ *
-+ */
-+
-+/*
-+ * 've.c' helper file performing VE sub-system initialization
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/delay.h>
-+#include <linux/capability.h>
-+#include <linux/ve.h>
-+#include <linux/smp_lock.h>
-+#include <linux/init.h>
-+
-+#include <linux/errno.h>
-+#include <linux/unistd.h>
-+#include <linux/slab.h>
-+#include <linux/sys.h>
-+#include <linux/kdev_t.h>
-+#include <linux/termios.h>
-+#include <linux/tty_driver.h>
-+#include <linux/netdevice.h>
-+#include <linux/utsname.h>
-+#include <linux/proc_fs.h>
-+#include <linux/kernel_stat.h>
-+#include <linux/module.h>
-+#include <linux/rcupdate.h>
-+#include <linux/ve_proto.h>
-+#include <linux/ve_owner.h>
-+#include <linux/devpts_fs.h>
-+
-+#include <linux/nfcalls.h>
-+
-+unsigned long vz_rstamp = 0x37e0f59d;
-+
-+#ifdef CONFIG_MODULES
-+struct module no_module = { .state = MODULE_STATE_GOING };
-+EXPORT_SYMBOL(no_module);
-+#endif
-+
-+#ifdef CONFIG_VE
-+
-+DCL_VE_OWNER(SKB, struct sk_buff, owner_env)
-+DCL_VE_OWNER(SK, struct sock, sk_owner_env)
-+DCL_VE_OWNER(TW, struct tcp_tw_bucket, tw_owner_env)
-+DCL_VE_OWNER(FILP, struct file, owner_env)
-+DCL_VE_OWNER(FSTYPE, struct file_system_type, owner_env)
-+
-+#if defined(CONFIG_VE_IPTABLES)
-+INIT_KSYM_MODULE(x_tables);
-+INIT_KSYM_MODULE(xt_tcpudp); -+INIT_KSYM_MODULE(ip_tables); -+INIT_KSYM_MODULE(iptable_filter); -+INIT_KSYM_MODULE(iptable_mangle); -+INIT_KSYM_MODULE(xt_limit); -+INIT_KSYM_MODULE(ipt_multiport); -+INIT_KSYM_MODULE(ipt_tos); -+INIT_KSYM_MODULE(ipt_TOS); -+INIT_KSYM_MODULE(ipt_REJECT); -+INIT_KSYM_MODULE(ipt_TCPMSS); -+INIT_KSYM_MODULE(xt_tcpmss); -+INIT_KSYM_MODULE(ipt_ttl); -+INIT_KSYM_MODULE(ipt_LOG); -+INIT_KSYM_MODULE(xt_length); -+INIT_KSYM_MODULE(ip_conntrack); -+INIT_KSYM_MODULE(ip_conntrack_ftp); -+INIT_KSYM_MODULE(ip_conntrack_irc); -+INIT_KSYM_MODULE(xt_conntrack); -+INIT_KSYM_MODULE(xt_state); -+INIT_KSYM_MODULE(xt_helper); -+INIT_KSYM_MODULE(ip_nat); -+INIT_KSYM_MODULE(iptable_nat); -+INIT_KSYM_MODULE(ip_nat_ftp); -+INIT_KSYM_MODULE(ip_nat_irc); -+INIT_KSYM_MODULE(ipt_REDIRECT); -+ -+INIT_KSYM_CALL(int, init_netfilter, (void)); -+INIT_KSYM_CALL(int, init_xtables, (void)); -+INIT_KSYM_CALL(int, init_xt_tcpudp, (void)); -+INIT_KSYM_CALL(int, init_iptables, (void)); -+INIT_KSYM_CALL(int, init_iptable_filter, (void)); -+INIT_KSYM_CALL(int, init_iptable_mangle, (void)); -+INIT_KSYM_CALL(int, init_xt_limit, (void)); -+INIT_KSYM_CALL(int, init_iptable_multiport, (void)); -+INIT_KSYM_CALL(int, init_iptable_tos, (void)); -+INIT_KSYM_CALL(int, init_iptable_TOS, (void)); -+INIT_KSYM_CALL(int, init_iptable_REJECT, (void)); -+INIT_KSYM_CALL(int, init_iptable_TCPMSS, (void)); -+INIT_KSYM_CALL(int, init_xt_tcpmss, (void)); -+INIT_KSYM_CALL(int, init_iptable_ttl, (void)); -+INIT_KSYM_CALL(int, init_iptable_LOG, (void)); -+INIT_KSYM_CALL(int, init_xt_length, (void)); -+INIT_KSYM_CALL(int, init_iptable_conntrack, (void)); -+INIT_KSYM_CALL(int, init_iptable_ftp, (void)); -+INIT_KSYM_CALL(int, init_iptable_irc, (void)); -+INIT_KSYM_CALL(int, init_xt_conntrack_match, (void)); -+INIT_KSYM_CALL(int, init_xt_state, (void)); -+INIT_KSYM_CALL(int, init_xt_helper, (void)); -+INIT_KSYM_CALL(int, ip_nat_init, (void)); -+INIT_KSYM_CALL(int, init_iptable_nat, (void)); 
-+INIT_KSYM_CALL(int, init_iptable_nat_ftp, (void)); -+INIT_KSYM_CALL(int, init_iptable_nat_irc, (void)); -+INIT_KSYM_CALL(int, init_iptable_REDIRECT, (void)); -+INIT_KSYM_CALL(void, fini_iptable_nat_irc, (void)); -+INIT_KSYM_CALL(void, fini_iptable_nat_ftp, (void)); -+INIT_KSYM_CALL(void, fini_iptable_nat, (void)); -+INIT_KSYM_CALL(void, ip_nat_cleanup, (void)); -+INIT_KSYM_CALL(void, fini_xt_helper, (void)); -+INIT_KSYM_CALL(void, fini_xt_state, (void)); -+INIT_KSYM_CALL(void, fini_xt_conntrack_match, (void)); -+INIT_KSYM_CALL(void, fini_iptable_irc, (void)); -+INIT_KSYM_CALL(void, fini_iptable_ftp, (void)); -+INIT_KSYM_CALL(void, fini_iptable_conntrack, (void)); -+INIT_KSYM_CALL(void, fini_xt_length, (void)); -+INIT_KSYM_CALL(void, fini_iptable_LOG, (void)); -+INIT_KSYM_CALL(void, fini_iptable_ttl, (void)); -+INIT_KSYM_CALL(void, fini_xt_tcpmss, (void)); -+INIT_KSYM_CALL(void, fini_iptable_TCPMSS, (void)); -+INIT_KSYM_CALL(void, fini_iptable_REJECT, (void)); -+INIT_KSYM_CALL(void, fini_iptable_TOS, (void)); -+INIT_KSYM_CALL(void, fini_iptable_tos, (void)); -+INIT_KSYM_CALL(void, fini_iptable_multiport, (void)); -+INIT_KSYM_CALL(void, fini_xt_limit, (void)); -+INIT_KSYM_CALL(void, fini_iptable_filter, (void)); -+INIT_KSYM_CALL(void, fini_iptable_mangle, (void)); -+INIT_KSYM_CALL(void, fini_iptables, (void)); -+INIT_KSYM_CALL(void, fini_xt_tcpudp, (void)); -+INIT_KSYM_CALL(void, fini_xtables, (void)); -+INIT_KSYM_CALL(void, fini_netfilter, (void)); -+INIT_KSYM_CALL(void, fini_iptable_REDIRECT, (void)); -+ -+INIT_KSYM_CALL(void, ipt_flush_table, (struct xt_table *table)); -+#endif -+ -+#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS) -+INIT_KSYM_MODULE(vzmon); -+INIT_KSYM_CALL(int, real_get_device_perms_ve, -+ (int dev_type, dev_t dev, int access_mode)); -+INIT_KSYM_CALL(void, real_do_env_cleanup, (struct ve_struct *env)); -+INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); -+INIT_KSYM_CALL(void, real_update_load_avg_ve, (void)); -+ 
-+/*
-+ * Thin wrappers that forward to the real implementations through
-+ * KSYMSAFECALL/KSYMSAFECALL_VOID, naming vzmon as the providing module.
-+ */
-+int get_device_perms_ve(int dev_type, dev_t dev, int access_mode)
-+{
-+	return KSYMSAFECALL(int, vzmon, real_get_device_perms_ve,
-+			(dev_type, dev, access_mode));
-+}
-+EXPORT_SYMBOL(get_device_perms_ve);
-+
-+void do_env_cleanup(struct ve_struct *env)
-+{
-+	KSYMSAFECALL_VOID(vzmon, real_do_env_cleanup, (env));
-+}
-+
-+void do_env_free(struct ve_struct *env)
-+{
-+	KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env));
-+}
-+EXPORT_SYMBOL(do_env_free);
-+
-+void do_update_load_avg_ve(void)
-+{
-+	KSYMSAFECALL_VOID(vzmon, real_update_load_avg_ve, ());
-+}
-+#endif
-+
-+/* Statically initialised environment "ve0". */
-+struct ve_struct ve0 = {
-+	.utsname = &system_utsname,
-+	.vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh),
-+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
-+	._net_dev_tail = &ve0._net_dev_base,
-+	.ifindex = -1,
-+#endif
-+#ifdef CONFIG_UNIX98_PTYS
-+	.devpts_config = &devpts_config,
-+#endif
-+};
-+
-+EXPORT_SYMBOL(ve0);
-+
-+#endif /* CONFIG_VE */
-diff -upr linux-2.6.16.orig/kernel/vecalls.c linux-2.6.16-026test009/kernel/vecalls.c
---- linux-2.6.16.orig/kernel/vecalls.c 2006-04-19 15:02:49.000000000 +0400
-+++ linux-2.6.16-026test009/kernel/vecalls.c 2006-04-19 15:02:12.000000000 +0400
-@@ -0,0 +1,3275 @@
-+/*
-+ * linux/kernel/vecalls.c
-+ *
-+ * Copyright (C) 2000-2005 SWsoft
-+ * All rights reserved.
-+ *
-+ */
-+
-+/*
-+ * 'vecalls.c' is the file with basic VE support. It provides basic
-+ * primitives along with the initialization script.
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/delay.h>
-+#include <linux/capability.h>
-+#include <linux/ve.h>
-+#include <linux/smp_lock.h>
-+#include <linux/init.h>
-+#include <linux/list.h>
-+#include <linux/ve_owner.h>
-+#include <linux/errno.h>
-+#include <linux/unistd.h>
-+#include <linux/slab.h>
-+#include <linux/vmalloc.h>
-+#include <linux/sys.h>
-+#include <linux/fs.h>
-+#include <linux/namespace.h>
-+#include <linux/termios.h>
-+#include <linux/tty_driver.h>
-+#include <linux/netdevice.h>
-+#include <linux/wait.h>
-+#include <linux/inetdevice.h>
-+#include <linux/utsname.h>
-+#include <linux/sysctl.h>
-+#include <linux/proc_fs.h>
-+#include <linux/seq_file.h>
-+#include <linux/kernel_stat.h>
-+#include <linux/module.h>
-+#include <linux/suspend.h>
-+#include <linux/rcupdate.h>
-+#include <linux/in.h>
-+#include <linux/major.h>
-+#include <linux/kdev_t.h>
-+#include <linux/idr.h>
-+#include <linux/inetdevice.h>
-+#include <net/pkt_sched.h>
-+#include <linux/divert.h>
-+#include <ub/beancounter.h>
-+
-+#include <net/route.h>
-+#include <net/ip_fib.h>
-+
-+#include <linux/ve_proto.h>
-+#include <linux/venet.h>
-+#include <linux/vzctl.h>
-+#include <linux/vzcalluser.h>
-+#ifdef CONFIG_FAIRSCHED
-+#include <linux/fairsched.h>
-+#endif
-+
-+#include <linux/nfcalls.h>
-+
-+/* Singly linked list of VEs (via ->next), protected by ve_list_guard. */
-+struct ve_struct *ve_list_head = NULL;
-+int nr_ve = 1;	/* One VE always exists. Compatibility with vestat */
-+rwlock_t ve_list_guard = RW_LOCK_UNLOCKED;
-+static rwlock_t devperms_hash_guard = RW_LOCK_UNLOCKED;
-+
-+extern int glob_virt_pids;
-+
-+static int do_env_enter(struct ve_struct *ve, unsigned int flags);
-+static void do_clean_devperms(envid_t veid);
-+static int alloc_ve_tty_drivers(struct ve_struct* ve);
-+static void free_ve_tty_drivers(struct ve_struct* ve);
-+static int register_ve_tty_drivers(struct ve_struct* ve);
-+static void unregister_ve_tty_drivers(struct ve_struct* ve);
-+static int init_ve_tty_drivers(struct ve_struct *);
-+static void fini_ve_tty_drivers(struct ve_struct *);
-+static void clear_termios(struct tty_driver* driver );
-+static void ve_mapped_devs_cleanup(struct ve_struct *ve);
-+
-+static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat *buf);
-+
-+static void vecalls_exit(void);
-+
-+/*
-+ * Walk ve_list_head looking for veid; returns the VE or NULL.
-+ * Takes no lock itself: the callers in this file hold ve_list_guard.
-+ */
-+struct ve_struct *__find_ve_by_id(envid_t veid)
-+{
-+	struct ve_struct *ve;
-+	for (ve = ve_list_head;
-+			ve != NULL && ve->veid != veid;
-+			ve = ve->next);
-+	return ve;
-+}
-+
-+/*
-+ * Look up a VE under the read lock and pass the result to get_ve().
-+ * NOTE(review): the lookup result may be NULL; this relies on get_ve()
-+ * accepting NULL, which is not visible here - confirm.
-+ */
-+struct ve_struct *get_ve_by_id(envid_t veid)
-+{
-+	struct ve_struct *ve;
-+	read_lock(&ve_list_guard);
-+	ve = __find_ve_by_id(veid);
-+	get_ve(ve);
-+	read_unlock(&ve_list_guard);
-+	return ve;
-+}
-+
-+/*
-+ * real_put_ve() MUST be used instead of put_ve() inside vecalls.
-+ *
-+ * Drops one reference; on the last one the VE is freed directly through
-+ * real_do_env_free().  It is a BUG for the VE to still have a positive
-+ * pcounter or to be marked running at that point.
-+ */
-+void real_do_env_free(struct ve_struct *ve);
-+static inline void real_put_ve(struct ve_struct *ve)
-+{
-+	if (ve && atomic_dec_and_test(&ve->counter)) {
-+		if (atomic_read(&ve->pcounter) > 0)
-+			BUG();
-+		if (ve->is_running)
-+			BUG();
-+		real_do_env_free(ve);
-+	}
-+}
-+
-+extern struct file_system_type devpts_fs_type;
-+extern struct file_system_type sysfs_fs_type;
-+extern struct file_system_type tmpfs_fs_type;
-+extern struct file_system_type proc_fs_type;
-+
-+extern spinlock_t task_capability_lock;
-+extern void ve_ipc_free(struct ve_struct * ve);
-+extern void ip_fragment_cleanup(struct ve_struct *ve);
-+
-+/*
-+ * Collect CPU statistics of VE `veid' and copy them to the user buffer.
-+ * The per-CPU counters are summed over all NR_CPUS slots while
-+ * ve_list_guard is held for reading; the copy to user space happens after
-+ * the lock is dropped, from a zeroed kernel-side vz_cpu_stat.
-+ * Returns 0, -EPERM when a non-super VE asks about another VE, -ESRCH for
-+ * veid 0 or an unknown VE, -ENOMEM or -EFAULT.
-+ */
-+static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat *buf)
-+{
-+	struct ve_struct *ve;
-+	struct vz_cpu_stat *vstat;
-+	int retval;
-+	int i, cpu;
-+	unsigned long tmp;
-+
-+	if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid))
-+		return -EPERM;
-+	if (veid == 0)
-+		return -ESRCH;
-+
-+	vstat = kmalloc(sizeof(*vstat), GFP_KERNEL);
-+	if (!vstat)
-+		return -ENOMEM;
-+	memset(vstat, 0, sizeof(*vstat));
-+
-+	retval = -ESRCH;
-+	read_lock(&ve_list_guard);
-+	ve = __find_ve_by_id(veid);
-+	if (ve == NULL)
-+		goto out_unlock;
-+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-+		struct ve_cpu_stats *st;
-+
-+		st = VE_CPU_STATS(ve, cpu);
-+		vstat->user_jif += st->user;
-+		vstat->nice_jif += st->nice;
-+		vstat->system_jif += st->system;
-+		vstat->idle_clk += ve_sched_get_idle_time(ve, cpu);
-+	}
-+	vstat->uptime_clk = get_cycles() - ve->start_cycles;
-+	vstat->uptime_jif = jiffies - ve->start_jiffies;
-+	/* split each avenrun value into integer and fractional parts */
-+	for (i = 0; i < 3; i++) {
-+		tmp = ve->avenrun[i] + (FIXED_1/200);
-+		vstat->avenrun[i].val_int = LOAD_INT(tmp);
-+		vstat->avenrun[i].val_frac = LOAD_FRAC(tmp);
-+	}
-+	read_unlock(&ve_list_guard);
-+
-+	retval = 0;
-+	if (copy_to_user(buf, vstat, sizeof(*vstat)))
-+		retval = -EFAULT;
-+out_free:
-+	kfree(vstat);
-+	return retval;
-+
-+out_unlock:
-+	read_unlock(&ve_list_guard);
-+	goto out_free;
-+}
-+
-+/********************************************************************** -+ * Devices permissions routines, -+ * character and block devices separately -+ **********************************************************************/ -+ -+/* Rules applied in the following order: -+ MAJOR!=0, MINOR!=0 -+ MAJOR!=0, MINOR==0 -+ MAJOR==0, MINOR==0 -+*/ -+struct devperms_struct -+{ -+ dev_t dev; /* device id */ -+ unsigned char mask; -+ unsigned type; -+ envid_t veid; -+ -+ struct devperms_struct *devhash_next; -+ struct devperms_struct **devhash_pprev; -+}; -+ -+static struct devperms_struct original_perms[] = -+{{ -+ MKDEV(0,0), /*device*/ -+ S_IROTH | S_IWOTH, -+ S_IFCHR, /*type*/ -+ 0, /*veid*/ -+ NULL, NULL -+}, -+{ -+ MKDEV(0,0), /*device*/ -+ S_IXGRP | S_IROTH | S_IWOTH, -+ S_IFBLK, /*type*/ -+ 0, /*veid*/ -+ NULL, NULL -+}}; -+ -+static struct devperms_struct default_major_perms[] = { -+ {MKDEV(UNIX98_PTY_MASTER_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, -+ {MKDEV(UNIX98_PTY_SLAVE_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, -+ {MKDEV(PTY_MASTER_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, -+ {MKDEV(PTY_SLAVE_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, -+}; -+static struct devperms_struct default_minor_perms[] = { -+ {MKDEV(MEM_MAJOR, 3), S_IROTH | S_IWOTH, S_IFCHR}, /* null */ -+ {MKDEV(MEM_MAJOR, 5), S_IROTH | S_IWOTH, S_IFCHR}, /* zero */ -+ {MKDEV(MEM_MAJOR, 7), S_IROTH | S_IWOTH, S_IFCHR}, /* full */ -+ {MKDEV(TTYAUX_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR},/* tty */ -+ {MKDEV(TTYAUX_MAJOR, 2), S_IROTH | S_IWOTH, S_IFCHR},/* ptmx */ -+ {MKDEV(MEM_MAJOR, 8), S_IROTH, S_IFCHR}, /* random */ -+ {MKDEV(MEM_MAJOR, 9), S_IROTH, S_IFCHR}, /* urandom */ -+}; -+ -+static struct devperms_struct default_deny_perms = { -+ MKDEV(0, 0), 0, S_IFCHR -+}; -+ -+static inline struct devperms_struct *find_default_devperms(int type, -+ dev_t dev) -+{ -+ int i; -+ -+ /* XXX all defaults perms are S_IFCHR */ -+ if (type != S_IFCHR) -+ return &default_deny_perms; -+ -+ for (i = 0; -+ i < 
sizeof(default_minor_perms)/sizeof(struct devperms_struct); -+ i++) -+ if (MAJOR(dev) == MAJOR(default_minor_perms[i].dev) && -+ MINOR(dev) == MINOR(default_minor_perms[i].dev)) -+ return &default_minor_perms[i]; -+ for (i = 0; -+ i < sizeof(default_major_perms)/sizeof(struct devperms_struct); -+ i++) -+ if (MAJOR(dev) == MAJOR(default_major_perms[i].dev)) -+ return &default_major_perms[i]; -+ -+ return &default_deny_perms; -+} -+ -+#define DEVPERMS_HASH_SZ 512 -+struct devperms_struct *devperms_hash[DEVPERMS_HASH_SZ]; -+ -+#define devperms_hashfn(id,dev) \ -+ ( (id << 5) ^ (id >> 5) ^ (MAJOR(dev)) ^ MINOR(dev) ) & \ -+ (DEVPERMS_HASH_SZ - 1) -+ -+static inline void hash_devperms(struct devperms_struct *p) -+{ -+ struct devperms_struct **htable = -+ &devperms_hash[devperms_hashfn(p->veid,p->dev)]; -+ -+ if ((p->devhash_next = *htable) != NULL) -+ (*htable)->devhash_pprev = &p->devhash_next; -+ *htable = p; -+ p->devhash_pprev = htable; -+} -+ -+static inline void unhash_devperms(struct devperms_struct *p) -+{ -+ if (p->devhash_next) -+ p->devhash_next->devhash_pprev = p->devhash_pprev; -+ *p->devhash_pprev = p->devhash_next; -+} -+ -+static int __init init_devperms_hash(void) -+{ -+ write_lock_irq(&devperms_hash_guard); -+ memset(devperms_hash, 0, sizeof(devperms_hash)); -+ hash_devperms(original_perms); -+ hash_devperms(original_perms+1); -+ write_unlock_irq(&devperms_hash_guard); -+ return 0; -+} -+ -+static inline void fini_devperms_hash(void) -+{ -+} -+ -+static inline struct devperms_struct *find_devperms(envid_t veid, -+ int type, -+ dev_t dev) -+{ -+ struct devperms_struct *p, **htable = -+ &devperms_hash[devperms_hashfn(veid,dev)]; -+ -+ for (p = *htable; p && !(p->type==type && -+ MAJOR(dev)==MAJOR(p->dev) && -+ MINOR(dev)==MINOR(p->dev) && -+ p->veid==veid); -+ p = p->devhash_next) -+ ; -+ return p; -+} -+ -+ -+static void do_clean_devperms(envid_t veid) -+{ -+ int i; -+ struct devperms_struct* ve; -+ -+ write_lock_irq(&devperms_hash_guard); -+ for (i = 
0; i < DEVPERMS_HASH_SZ; i++) -+ for (ve = devperms_hash[i]; ve;) { -+ struct devperms_struct *next = ve->devhash_next; -+ if (ve->veid == veid) { -+ unhash_devperms(ve); -+ kfree(ve); -+ } -+ -+ ve = next; -+ } -+ write_unlock_irq(&devperms_hash_guard); -+} -+ -+/* -+ * Mode is a mask of -+ * FMODE_READ for read access (configurable by S_IROTH) -+ * FMODE_WRITE for write access (configurable by S_IWOTH) -+ * FMODE_QUOTACTL for quotactl access (configurable by S_IXGRP) -+ */ -+int real_get_device_perms_ve(int dev_type, dev_t dev, int access_mode) -+{ -+ struct devperms_struct *perms; -+ struct ve_struct *ve; -+ envid_t veid; -+ -+ perms = NULL; -+ ve = get_exec_env(); -+ veid = ve->veid; -+ -+ read_lock(&devperms_hash_guard); -+ -+ perms = find_devperms(veid, dev_type|VE_USE_MINOR, dev); -+ if (perms) -+ goto end; -+ -+ perms = find_devperms(veid, dev_type|VE_USE_MAJOR, MKDEV(MAJOR(dev),0)); -+ if (perms) -+ goto end; -+ -+ perms = find_devperms(veid, dev_type, MKDEV(0,0)); -+ if (perms) -+ goto end; -+ -+ perms = find_default_devperms(dev_type, dev); -+ -+end: -+ read_unlock(&devperms_hash_guard); -+ -+ access_mode = "\000\004\002\006\010\014\012\016"[access_mode]; -+ return perms ? -+ (((perms->mask & access_mode) == access_mode) ? 
0 : -EACCES) : -+ -ENODEV; -+} -+EXPORT_SYMBOL(real_get_device_perms_ve); -+ -+int do_setdevperms(envid_t veid, unsigned type, dev_t dev, unsigned mask) -+{ -+ struct devperms_struct *perms; -+ -+ write_lock_irq(&devperms_hash_guard); -+ perms = find_devperms(veid, type, dev); -+ if (!perms) { -+ struct devperms_struct *perms_new; -+ write_unlock_irq(&devperms_hash_guard); -+ -+ perms_new = kmalloc(sizeof(struct devperms_struct), GFP_KERNEL); -+ if (!perms_new) -+ return -ENOMEM; -+ -+ write_lock_irq(&devperms_hash_guard); -+ perms = find_devperms(veid, type, dev); -+ if (perms) { -+ kfree(perms_new); -+ perms_new = perms; -+ } -+ -+ switch (type & VE_USE_MASK) { -+ case 0: -+ dev = 0; -+ break; -+ case VE_USE_MAJOR: -+ dev = MKDEV(MAJOR(dev),0); -+ break; -+ } -+ -+ perms_new->veid = veid; -+ perms_new->dev = dev; -+ perms_new->type = type; -+ perms_new->mask = mask & S_IALLUGO; -+ hash_devperms(perms_new); -+ } else -+ perms->mask = mask & S_IALLUGO; -+ write_unlock_irq(&devperms_hash_guard); -+ return 0; -+} -+EXPORT_SYMBOL(do_setdevperms); -+ -+int real_setdevperms(envid_t veid, unsigned type, dev_t dev, unsigned mask) -+{ -+ struct ve_struct *ve; -+ int err; -+ -+ if (!capable(CAP_SETVEID) || veid == 0) -+ return -EPERM; -+ -+ if ((ve = get_ve_by_id(veid)) == NULL) -+ return -ESRCH; -+ -+ down_read(&ve->op_sem); -+ err = -ESRCH; -+ if (ve->is_running) -+ err = do_setdevperms(veid, type, dev, mask); -+ up_read(&ve->op_sem); -+ real_put_ve(ve); -+ return err; -+} -+ -+void real_update_load_avg_ve(void) -+{ -+ struct ve_struct *ve; -+ unsigned long nr_active; -+ -+ read_lock(&ve_list_guard); -+ for (ve = ve_list_head; ve != NULL; ve = ve->next) { -+ nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve); -+ nr_active *= FIXED_1; -+ CALC_LOAD(ve->avenrun[0], EXP_1, nr_active); -+ CALC_LOAD(ve->avenrun[1], EXP_5, nr_active); -+ CALC_LOAD(ve->avenrun[2], EXP_15, nr_active); -+ } -+ read_unlock(&ve_list_guard); -+} -+ -+ 
-+/********************************************************************** -+ ********************************************************************** -+ * -+ * FS-related helpers to VE start/stop -+ * -+ ********************************************************************** -+ **********************************************************************/ -+ -+/* -+ * DEVPTS needs a virtualization: each environment should see each own list of -+ * pseudo-terminals. -+ * To implement it we need to have separate devpts superblocks for each -+ * VE, and each VE should mount its own one. -+ * Thus, separate vfsmount structures are required. -+ * To minimize intrusion into vfsmount lookup code, separate file_system_type -+ * structures are created. -+ * -+ * In addition to this, patch fo character device itself is required, as file -+ * system itself is used only for MINOR/MAJOR lookup. -+ */ -+static int register_ve_fs_type(struct ve_struct *ve, -+ struct file_system_type *template, -+ struct file_system_type **p_fs_type, struct vfsmount **p_mnt) -+{ -+ struct vfsmount *mnt; -+ struct file_system_type *local_fs_type; -+ int ret; -+ -+ VZTRACE("register_ve_fs_type(\"%s\")\n", template->name); -+ -+ local_fs_type = kmalloc(sizeof(*local_fs_type) + sizeof(void *), -+ GFP_KERNEL); -+ if (local_fs_type == NULL) -+ return -ENOMEM; -+ -+ memset(local_fs_type, 0, sizeof(*local_fs_type)); -+ local_fs_type->name = template->name; -+ local_fs_type->fs_flags = template->fs_flags; -+ local_fs_type->get_sb = template->get_sb; -+ local_fs_type->kill_sb = template->kill_sb; -+ local_fs_type->owner = template->owner; -+ /* -+ * 1. we do not have refcounter on fstype -+ * 2. fstype holds reference to ve using get_ve()/put_ve(). 
-+ * so we free fstype when freeing ve and we are sure it's ok to free it -+ */ -+ SET_VE_OWNER_FSTYPE(local_fs_type, ve); -+ get_filesystem(local_fs_type); /* get_ve() inside */ -+ -+ ret = register_filesystem(local_fs_type); /* does not get */ -+ if (ret) -+ goto reg_err; -+ -+ mnt = kern_mount(local_fs_type); -+ if (IS_ERR(mnt)) -+ goto mnt_err; -+ -+ /* Usage counters after succesful execution kern_mount: -+ * local_fs_type - +1 (get_fs_type,get_sb_single,put_filesystem) -+ * mnt - +1 == 1 (alloc_vfsmnt) -+ */ -+ -+ *p_fs_type = local_fs_type; -+ *p_mnt = mnt; -+ return 0; -+ -+mnt_err: -+ ret = PTR_ERR(mnt); -+ unregister_filesystem(local_fs_type); /* does not put */ -+ -+reg_err: -+ put_filesystem(local_fs_type); -+ kfree(local_fs_type); -+ printk(KERN_DEBUG -+ "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret); -+ return ret; -+} -+ -+static void umount_ve_fs_type(struct file_system_type *local_fs_type) -+{ -+ struct vfsmount *mnt; -+ struct list_head *p, *q; -+ LIST_HEAD(kill); -+ LIST_HEAD(umount_list); -+ -+ down_write(&namespace_sem); -+ spin_lock(&vfsmount_lock); -+ list_for_each_safe(p, q, ¤t->namespace->list) { -+ mnt = list_entry(p, struct vfsmount, mnt_list); -+ if (mnt->mnt_sb->s_type != local_fs_type) -+ continue; -+ list_del(p); -+ list_add(p, &kill); -+ } -+ -+ while (!list_empty(&kill)) { -+ mnt = list_entry(kill.next, struct vfsmount, mnt_list); -+ umount_tree(mnt, 1, &umount_list); -+ } -+ spin_unlock(&vfsmount_lock); -+ up_write(&namespace_sem); -+ release_mounts(&umount_list); -+} -+ -+static void unregister_ve_fs_type(struct file_system_type *local_fs_type, -+ struct vfsmount *local_fs_mount) -+{ -+ if (local_fs_mount == NULL || -+ local_fs_type == NULL) { -+ if (local_fs_mount != NULL || -+ local_fs_type != NULL) -+ BUG(); -+ return; -+ } -+ -+ VZTRACE("unregister_ve_fs_type(\"%s\")\n", local_fs_type->name); -+ -+ unregister_filesystem(local_fs_type); -+ umount_ve_fs_type(local_fs_type); -+ kern_umount(local_fs_mount); /* alias 
to mntput, drop our ref */ -+ put_filesystem(local_fs_type); -+} -+ -+ -+/********************************************************************** -+ ********************************************************************** -+ * -+ * FS-related helpers to VE start/stop -+ * -+ ********************************************************************** -+ **********************************************************************/ -+ -+#ifdef CONFIG_SYSCTL -+static ctl_table ve_sysctl_tables[] = { -+ /* kernel */ -+ { -+ .ctl_name = CTL_KERN, -+ .procname = "kernel", -+ .mode = 0555, -+ .child = &ve_sysctl_tables[2], -+ }, -+ { .ctl_name = 0 }, -+ /* kernel/[vars] */ -+ { -+ .ctl_name = KERN_NODENAME, -+ .procname = "hostname", -+ .maxlen = 64, -+ .mode = 0644, -+ .proc_handler = &proc_doutsstring, -+ .strategy = &sysctl_string, -+ }, -+ { -+ .ctl_name = KERN_DOMAINNAME, -+ .procname = "domainname", -+ .maxlen = 64, -+ .mode = 0644, -+ .proc_handler = &proc_doutsstring, -+ .strategy = &sysctl_string, -+ }, -+ { -+ .ctl_name = KERN_SHMMAX, -+ .procname = "shmmax", -+ .maxlen = sizeof(size_t), -+ .mode = 0644, -+ .proc_handler = &proc_doulongvec_minmax, -+ }, -+ { -+ .ctl_name = KERN_SHMALL, -+ .procname = "shmall", -+ .maxlen = sizeof(size_t), -+ .mode = 0644, -+ .proc_handler = &proc_doulongvec_minmax, -+ }, -+ { -+ .ctl_name = KERN_SHMMNI, -+ .procname = "shmmni", -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = KERN_MSGMAX, -+ .procname = "msgmax", -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = KERN_MSGMNI, -+ .procname = "msgmni", -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = KERN_MSGMNB, -+ .procname = "msgmnb", -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = KERN_SEM, -+ .procname = "sem", -+ .maxlen = 4 * sizeof(int), -+ .mode = 0644, -+ .proc_handler = 
&proc_dointvec -+ }, -+ { .ctl_name = 0, } -+}; -+ -+static int register_ve_sysctltables(struct ve_struct *ve) -+{ -+ struct ctl_table_header *header; -+ ctl_table *root, *table; -+ -+ VZTRACE("register_ve_sysctltables\n"); -+ -+ root = clone_sysctl_template(ve_sysctl_tables, -+ sizeof(ve_sysctl_tables) / sizeof(ctl_table)); -+ if (root == NULL) -+ goto out; -+ -+ table = root->child; -+ table[0].data = &ve->utsname->nodename; -+ table[1].data = &ve->utsname->domainname; -+ table[2].data = &ve->_shm_ctlmax; -+ table[3].data = &ve->_shm_ctlall; -+ table[4].data = &ve->_shm_ctlmni; -+ table[5].data = &ve->_msg_ctlmax; -+ table[6].data = &ve->_msg_ctlmni; -+ table[7].data = &ve->_msg_ctlmnb; -+ table[8].data = &ve->_sem_ctls[0]; -+ -+ /* insert at head to override kern entries */ -+ header = register_sysctl_table(root, 1); -+ if (header == NULL) -+ goto out_free; -+ -+ ve->kern_header = header; -+ ve->kern_table = root; -+ return 0; -+ -+out_free: -+ free_sysctl_clone(root); -+out: -+ return -ENOMEM; -+} -+ -+static inline void unregister_ve_sysctltables(struct ve_struct *ve) -+{ -+ unregister_sysctl_table(ve->kern_header); -+} -+ -+static inline void free_ve_sysctltables(struct ve_struct *ve) -+{ -+ free_sysctl_clone(ve->kern_table); -+} -+#endif -+ -+ -+/********************************************************************** -+ ********************************************************************** -+ * -+ * VE start: subsystems -+ * -+ ********************************************************************** -+ **********************************************************************/ -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#include <net/ip.h> -+#include <net/tcp.h> -+#include <net/udp.h> -+#include <net/icmp.h> -+ -+extern struct new_utsname virt_utsname; -+ -+static int init_ve_utsname(struct ve_struct *ve) -+{ -+ ve->utsname = kmalloc(sizeof(*ve->utsname), GFP_KERNEL); -+ if (ve->utsname == NULL) -+ return -ENOMEM; -+ -+ 
down_read(&uts_sem); /* protect the source */ -+ memcpy(ve->utsname, &system_utsname, sizeof(*ve->utsname)); -+ memcpy(ve->utsname->release, virt_utsname.release, -+ sizeof(virt_utsname.release)); -+ up_read(&uts_sem); -+ -+ return 0; -+} -+ -+static void free_ve_utsname(struct ve_struct *ve) -+{ -+ kfree(ve->utsname); -+ ve->utsname = NULL; -+} -+ -+static int init_fini_ve_mibs(struct ve_struct *ve, int fini) -+{ -+ if (fini) -+ goto fini; -+ if (!(ve->_net_statistics[0] = alloc_percpu(struct linux_mib))) -+ goto out1; -+ if (!(ve->_net_statistics[1] = alloc_percpu(struct linux_mib))) -+ goto out2; -+ if (!(ve->_ip_statistics[0] = alloc_percpu(struct ipstats_mib))) -+ goto out3; -+ if (!(ve->_ip_statistics[1] = alloc_percpu(struct ipstats_mib))) -+ goto out4; -+ if (!(ve->_icmp_statistics[0] = alloc_percpu(struct icmp_mib))) -+ goto out5; -+ if (!(ve->_icmp_statistics[1] = alloc_percpu(struct icmp_mib))) -+ goto out6; -+ if (!(ve->_tcp_statistics[0] = alloc_percpu(struct tcp_mib))) -+ goto out7; -+ if (!(ve->_tcp_statistics[1] = alloc_percpu(struct tcp_mib))) -+ goto out8; -+ if (!(ve->_udp_statistics[0] = alloc_percpu(struct udp_mib))) -+ goto out9; -+ if (!(ve->_udp_statistics[1] = alloc_percpu(struct udp_mib))) -+ goto out10; -+ return 0; -+fini: -+ free_percpu(ve->_udp_statistics[1]); -+out10: -+ free_percpu(ve->_udp_statistics[0]); -+out9: -+ free_percpu(ve->_tcp_statistics[1]); -+out8: -+ free_percpu(ve->_tcp_statistics[0]); -+out7: -+ free_percpu(ve->_icmp_statistics[1]); -+out6: -+ free_percpu(ve->_icmp_statistics[0]); -+out5: -+ free_percpu(ve->_ip_statistics[1]); -+out4: -+ free_percpu(ve->_ip_statistics[0]); -+out3: -+ free_percpu(ve->_net_statistics[1]); -+out2: -+ free_percpu(ve->_net_statistics[0]); -+out1: -+ return -ENOMEM; -+} -+ -+static inline int init_ve_mibs(struct ve_struct *ve) -+{ -+ return init_fini_ve_mibs(ve, 0); -+} -+ -+static inline void fini_ve_mibs(struct ve_struct *ve) -+{ -+ (void)init_fini_ve_mibs(ve, 1); -+} -+ -+extern struct 
net_device templ_loopback_dev; -+static void veloop_setup(struct net_device *dev) -+{ -+ int padded; -+ padded = dev->padded; -+ memcpy(dev, &templ_loopback_dev, sizeof(struct net_device)); -+ dev->padded = padded; -+} -+ -+static int init_ve_netdev(void) -+{ -+ struct ve_struct *ve; -+ struct net_device_stats *stats; -+ int err; -+ -+ ve = get_exec_env(); -+ INIT_HLIST_HEAD(&ve->_net_dev_head); -+ ve->_net_dev_base = NULL; -+ ve->_net_dev_tail = &ve->_net_dev_base; -+ -+ ve->_loopback_dev = alloc_netdev(0, templ_loopback_dev.name, -+ veloop_setup); -+ if (ve->_loopback_dev == NULL) -+ return -ENOMEM; -+ if (loopback_dev.get_stats != NULL) { -+ stats = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); -+ if (stats != NULL) { -+ memset(stats, 0, sizeof(struct net_device_stats)); -+ ve->_loopback_dev->priv = stats; -+ ve->_loopback_dev->get_stats = loopback_dev.get_stats; -+ ve->_loopback_dev->destructor = loopback_dev.destructor; -+ } -+ } -+ err = register_netdev(ve->_loopback_dev); -+ if (err) { -+ if (ve->_loopback_dev->priv != NULL) -+ kfree(ve->_loopback_dev->priv); -+ free_netdev(ve->_loopback_dev); -+ } -+ return err; -+} -+ -+static void fini_ve_netdev(void) -+{ -+ struct ve_struct *ve; -+ struct net_device *dev; -+ -+ ve = get_exec_env(); -+ while (1) { -+ rtnl_lock(); -+ /* -+ * loopback is special, it can be referenced in fib's, -+ * so it must be freed the last. Doing so is -+ * sufficient to guarantee absence of such references. 
-+ */ -+ if (dev_base == ve->_loopback_dev) -+ dev = dev_base->next; -+ else -+ dev = dev_base; -+ if (dev == NULL) -+ break; -+ unregister_netdevice(dev); -+ rtnl_unlock(); -+ free_netdev(dev); -+ } -+ unregister_netdevice(ve->_loopback_dev); -+ rtnl_unlock(); -+ free_netdev(ve->_loopback_dev); -+ ve->_loopback_dev = NULL; -+} -+#else -+#define init_ve_mibs(ve) (0) -+#define fini_ve_mibs(ve) do { } while (0) -+#define init_ve_netdev() (0) -+#define fini_ve_netdev() do { } while (0) -+#endif -+ -+static int prepare_proc_root(struct ve_struct *ve) -+{ -+ struct proc_dir_entry *de; -+ -+ de = kmalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL); -+ if (de == NULL) -+ return -ENOMEM; -+ memset(de, 0, sizeof(struct proc_dir_entry)); -+ memcpy(de + 1, "/proc", 6); -+ de->name = (char *)(de + 1); -+ de->namelen = 5; -+ de->mode = S_IFDIR | S_IRUGO | S_IXUGO; -+ de->nlink = 2; -+ atomic_set(&de->count, 1); -+ -+ ve->proc_root = de; -+ return 0; -+} -+ -+#ifdef CONFIG_PROC_FS -+static int init_ve_proc(struct ve_struct *ve) -+{ -+ int err; -+ struct proc_dir_entry *de; -+ -+ err = prepare_proc_root(ve); -+ if (err) -+ goto out_root; -+ -+ err = register_ve_fs_type(ve, &proc_fs_type, -+ &ve->proc_fstype, &ve->proc_mnt); -+ if (err) -+ goto out_reg; -+ -+ /* create necessary /proc subdirs in VE local proc tree */ -+ err = -ENOMEM; -+ de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); -+ if (!de) -+ goto out_vz; -+ -+#ifdef CONFIG_VE_IPTABLES -+ proc_net = proc_mkdir("net", NULL); -+ if (!proc_net) -+ goto out_net; -+#endif -+ -+ return 0; -+ -+#ifdef CONFIG_VE_IPTABLES -+out_net: -+ remove_proc_entry("vz", NULL); -+#endif -+out_vz: -+ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); -+ ve->proc_mnt = NULL; -+out_reg: -+ /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */ -+ ; -+out_root: -+ return err; -+} -+ -+static void fini_ve_proc(struct ve_struct *ve) -+{ -+#ifdef CONFIG_VE_IPTABLES -+ remove_proc_entry("net", NULL); -+ proc_net = 
NULL; -+#endif -+ remove_proc_entry("vz", NULL); -+ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); -+ ve->proc_mnt = NULL; -+} -+ -+static void free_ve_proc(struct ve_struct *ve) -+{ -+ /* proc filesystem frees proc_dir_entries on remove_proc_entry() only, -+ so we check that everything was removed and not lost */ -+ if (ve->proc_root && ve->proc_root->subdir) { -+ struct proc_dir_entry *p = ve->proc_root; -+ printk(KERN_WARNING "VPS: %d: proc entry /proc", ve->veid); -+ while ((p = p->subdir) != NULL) -+ printk("/%s", p->name); -+ printk(" is not removed!\n"); -+ } -+ -+ kfree(ve->proc_root); -+ kfree(ve->proc_fstype); -+ -+ ve->proc_fstype = NULL; -+ ve->proc_root = NULL; -+} -+#else -+#define init_ve_proc(ve) (0) -+#define fini_ve_proc(ve) do { } while (0) -+#define free_ve_proc(ve) do { } while (0) -+#endif -+ -+#ifdef CONFIG_SYSCTL -+static int init_ve_sysctl(struct ve_struct *ve) -+{ -+ int err; -+ -+#ifdef CONFIG_PROC_FS -+ err = -ENOMEM; -+ ve->proc_sys_root = proc_mkdir("sys", 0); -+ if (ve->proc_sys_root == NULL) -+ goto out_proc; -+#endif -+ INIT_LIST_HEAD(&ve->sysctl_lh); -+ err = register_ve_sysctltables(ve); -+ if (err) -+ goto out_reg; -+ -+ err = devinet_sysctl_init(ve); -+ if (err) -+ goto out_dev; -+ -+ return 0; -+ -+out_dev: -+ unregister_ve_sysctltables(ve); -+ free_ve_sysctltables(ve); -+out_reg: -+#ifdef CONFIG_PROC_FS -+ remove_proc_entry("sys", NULL); -+out_proc: -+#endif -+ return err; -+} -+ -+static void fini_ve_sysctl(struct ve_struct *ve) -+{ -+ devinet_sysctl_fini(ve); -+ unregister_ve_sysctltables(ve); -+ remove_proc_entry("sys", NULL); -+} -+ -+static void free_ve_sysctl(struct ve_struct *ve) -+{ -+ devinet_sysctl_free(ve); -+ free_ve_sysctltables(ve); -+} -+#else -+#define init_ve_sysctl(ve) (0) -+#define fini_ve_sysctl(ve) do { } while (0) -+#define free_ve_sysctl(ve) do { } while (0) -+#endif -+ -+#ifdef CONFIG_UNIX98_PTYS -+#include <linux/devpts_fs.h> -+ -+static int init_ve_devpts(struct ve_struct *ve) -+{ -+ int err; 
-+ -+ err = -ENOMEM; -+ ve->devpts_config = kmalloc(sizeof(struct devpts_config), GFP_KERNEL); -+ if (ve->devpts_config == NULL) -+ goto out; -+ memset(ve->devpts_config, 0, sizeof(struct devpts_config)); -+ ve->devpts_config->mode = 0600; -+ err = register_ve_fs_type(ve, &devpts_fs_type, -+ &ve->devpts_fstype, &ve->devpts_mnt); -+ if (err) { -+ kfree(ve->devpts_config); -+ ve->devpts_config = NULL; -+ } -+out: -+ return err; -+} -+ -+static void fini_ve_devpts(struct ve_struct *ve) -+{ -+ unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt); -+ /* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */ -+ ve->devpts_mnt = NULL; -+ kfree(ve->devpts_config); -+ ve->devpts_config = NULL; -+} -+#else -+#define init_ve_devpts(ve) (0) -+#define fini_ve_devpts(ve) do { } while (0) -+#endif -+ -+static int init_ve_shmem(struct ve_struct *ve) -+{ -+ return register_ve_fs_type(ve, -+ &tmpfs_fs_type, -+ &ve->shmem_fstype, -+ &ve->shmem_mnt); -+} -+ -+static void fini_ve_shmem(struct ve_struct *ve) -+{ -+ unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt); -+ /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */ -+ ve->shmem_mnt = NULL; -+} -+ -+static inline int init_ve_sysfs_root(struct ve_struct *ve) -+{ -+ struct sysfs_dirent *sysfs_root; -+ -+ sysfs_root = kmalloc(sizeof(struct sysfs_dirent), GFP_KERNEL); -+ if (sysfs_root == NULL) -+ return -ENOMEM; -+ -+ memset(sysfs_root, 0, sizeof(struct sysfs_dirent)); -+ INIT_LIST_HEAD(&sysfs_root->s_sibling); -+ INIT_LIST_HEAD(&sysfs_root->s_children); -+ sysfs_root->s_type = SYSFS_ROOT; -+ ve->sysfs_root = sysfs_root; -+ return 0; -+} -+ -+static int init_ve_sysfs(struct ve_struct *ve) -+{ -+ struct subsystem *subsys; -+ struct class *nc; -+ int err; -+ extern struct subsystem class_obj_subsys; -+ extern struct subsystem class_subsys; -+ extern struct class net_class; -+ -+#ifdef CONFIG_SYSFS -+ err = 0; -+ if (ve->features & VE_FEATURE_SYSFS) { -+ err = init_ve_sysfs_root(ve); -+ if (err != 0) -+ goto 
out; -+ err = register_ve_fs_type(ve, -+ &sysfs_fs_type, -+ &ve->sysfs_fstype, -+ &ve->sysfs_mnt); -+ } -+ if (err != 0) -+ goto out_fs_type; -+#endif -+ err = -ENOMEM; -+ subsys = kmalloc(sizeof(*subsys), GFP_KERNEL); -+ if (subsys == NULL) -+ goto out_class_obj; -+ /* ick, this is ugly, the things we go through to keep from showing up -+ * in sysfs... */ -+ memset(subsys, 0, sizeof(*subsys)); -+ memcpy(&subsys->kset.kobj.name, &class_obj_subsys.kset.kobj.name, -+ sizeof(subsys->kset.kobj.name)); -+ subsys->kset.ktype = class_obj_subsys.kset.ktype; -+ subsys->kset.uevent_ops = class_obj_subsys.kset.uevent_ops; -+ subsystem_init(subsys); -+ if (!subsys->kset.subsys) -+ subsys->kset.subsys = subsys; -+ ve->class_obj_subsys = subsys; -+ -+ err = -ENOMEM; -+ subsys = kmalloc(sizeof(*subsys), GFP_KERNEL); -+ if (subsys == NULL) -+ goto out_class_subsys; -+ /* ick, this is ugly, the things we go through to keep from showing up -+ * in sysfs... */ -+ memset(subsys, 0, sizeof(*subsys)); -+ memcpy(&subsys->kset.kobj.name, &class_subsys.kset.kobj.name, -+ sizeof(subsys->kset.kobj.name)); -+ subsys->kset.ktype = class_subsys.kset.ktype; -+ subsys->kset.uevent_ops = class_subsys.kset.uevent_ops; -+ ve->class_subsys = subsys; -+ err = subsystem_register(subsys); -+ if (err != 0) -+ goto out_register; -+ -+ err = -ENOMEM; -+ nc = kmalloc(sizeof(*nc), GFP_KERNEL); -+ if (nc == NULL) -+ goto out_nc; -+ memset(nc, 0, sizeof(*nc)); -+ nc->name = net_class.name; -+ nc->release = net_class.release; -+ nc->uevent = net_class.uevent; -+ err = class_register(nc); -+ if (err != 0) -+ goto out_class_register; -+ ve->net_class = nc; -+ -+ return err; -+ -+out_class_register: -+ kfree(nc); -+out_nc: -+ subsystem_unregister(subsys); -+out_register: -+ kfree(ve->class_subsys); -+out_class_subsys: -+ kfree(ve->class_obj_subsys); -+out_class_obj: -+#ifdef CONFIG_SYSFS -+ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); -+ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ 
-+out_fs_type: -+ kfree(ve->sysfs_root); -+ ve->sysfs_root = NULL; -+#endif -+ ve->class_subsys = NULL; -+ ve->class_obj_subsys = NULL; -+out: -+ return err; -+} -+ -+static void fini_ve_sysfs(struct ve_struct *ve) -+{ -+ class_unregister(ve->net_class); -+ subsystem_unregister(ve->class_subsys); -+ -+ kfree(ve->net_class); -+ kfree(ve->class_subsys); -+ kfree(ve->class_obj_subsys); -+ -+ ve->net_class = NULL; -+ ve->class_subsys = NULL; -+ ve->class_obj_subsys = NULL; -+#ifdef CONFIG_SYSFS -+ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); -+ ve->sysfs_mnt = NULL; -+ kfree(ve->sysfs_root); -+ ve->sysfs_root = NULL; -+ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ -+#endif -+} -+ -+static void free_ve_filesystems(struct ve_struct *ve) -+{ -+#ifdef CONFIG_SYSFS -+ kfree(ve->sysfs_fstype); -+ ve->sysfs_fstype = NULL; -+#endif -+ kfree(ve->shmem_fstype); -+ ve->shmem_fstype = NULL; -+ -+ kfree(ve->devpts_fstype); -+ ve->devpts_fstype = NULL; -+ -+ free_ve_proc(ve); -+} -+ -+static int init_printk(struct ve_struct *ve) -+{ -+ struct ve_prep_printk { -+ wait_queue_head_t log_wait; -+ unsigned long log_start; -+ unsigned long log_end; -+ unsigned long logged_chars; -+ } *tmp; -+ -+ tmp = kmalloc(sizeof(struct ve_prep_printk), GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ memset(tmp, 0, sizeof(struct ve_prep_printk)); -+ init_waitqueue_head(&tmp->log_wait); -+ ve->_log_wait = &tmp->log_wait; -+ ve->_log_start = &tmp->log_start; -+ ve->_log_end = &tmp->log_end; -+ ve->_logged_chars = &tmp->logged_chars; -+ /* ve->log_buf will be initialized later by ve_log_init() */ -+ return 0; -+} -+ -+static void fini_printk(struct ve_struct *ve) -+{ -+ /* -+ * there is no spinlock protection here because nobody can use -+ * log_buf at the moments when this code is called. 
-+ */ -+ kfree(ve->log_buf); -+ kfree(ve->_log_wait); -+} -+ -+static void fini_venet(struct ve_struct *ve) -+{ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+ tcp_v4_kill_ve_sockets(ve); -+#endif -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+ ve_mapped_devs_cleanup(ve); -+#endif -+} -+ -+static int init_ve_sched(struct ve_struct *ve) -+{ -+#ifdef CONFIG_FAIRSCHED -+ int err; -+ -+ /* -+ * We refuse to switch to an already existing node since nodes -+ * keep a pointer to their ve_struct... -+ */ -+ err = sys_fairsched_mknod(0, 1, ve->veid); -+ if (err < 0) { -+ printk(KERN_WARNING "Can't create fairsched node %d\n", -+ ve->veid); -+ return err; -+ } -+ err = sys_fairsched_mvpr(current->pid, ve->veid); -+ if (err) { -+ printk(KERN_WARNING "Can't switch to fairsched node %d\n", -+ ve->veid); -+ if (sys_fairsched_rmnod(ve->veid)) -+ printk(KERN_ERR "Can't clean fairsched node %d\n", -+ ve->veid); -+ return err; -+ } -+#endif -+ ve_sched_attach(ve); -+ return 0; -+} -+ -+static void fini_ve_sched(struct ve_struct *ve) -+{ -+#ifdef CONFIG_FAIRSCHED -+ if (task_vsched_id(current) == ve->veid) -+ if (sys_fairsched_mvpr(current->pid, fairsched_init_node.id)) -+ printk(KERN_WARNING "Can't leave fairsched node %d\n", -+ ve->veid); -+ if (sys_fairsched_rmnod(ve->veid)) -+ printk(KERN_ERR "Can't remove fairsched node %d\n", -+ ve->veid); -+#endif -+} -+ -+static int init_ve_struct(struct ve_struct *ve, envid_t veid, -+ u32 class_id, env_create_param_t *data, -+ struct task_struct *init_tsk) -+{ -+ int n; -+ -+ memset(ve, 0, sizeof(*ve)); -+ (void)get_ve(ve); -+ ve->veid = veid; -+ ve->class_id = class_id; -+ ve->init_entry = init_tsk; -+ ve->features = data->feature_mask; -+ INIT_LIST_HEAD(&ve->vetask_lh); -+ init_rwsem(&ve->op_sem); -+ ve->ifindex = -1; -+ -+ for(n = 0; n < UIDHASH_SZ_VE; ++n) -+ INIT_LIST_HEAD(&ve->uidhash_table[n]); -+ -+ do_posix_clock_monotonic_gettime(&ve->start_timespec); -+ ve->start_jiffies = jiffies; -+ 
ve->start_cycles = get_cycles(); -+ ve->virt_pids = glob_virt_pids; -+ -+ return 0; -+} -+ -+static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk) -+{ -+ read_lock(&tsk->fs->lock); -+ ve->fs_rootmnt = tsk->fs->rootmnt; -+ ve->fs_root = tsk->fs->root; -+ read_unlock(&tsk->fs->lock); -+ mark_tree_virtual(ve->fs_rootmnt, ve->fs_root); -+} -+ -+static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk) -+{ -+ /* required for real_setdevperms from register_ve_<fs> above */ -+ memcpy(&ve->cap_default, &tsk->cap_effective, sizeof(kernel_cap_t)); -+ cap_lower(ve->cap_default, CAP_SETVEID); -+} -+ -+static int ve_list_add(struct ve_struct *ve) -+{ -+ write_lock_irq(&ve_list_guard); -+ if (__find_ve_by_id(ve->veid) != NULL) -+ goto err_exists; -+ -+ ve->prev = NULL; -+ ve->next = ve_list_head; -+ if (ve_list_head) -+ ve_list_head->prev = ve; -+ ve_list_head = ve; -+ nr_ve++; -+ write_unlock_irq(&ve_list_guard); -+ return 0; -+ -+err_exists: -+ write_unlock_irq(&ve_list_guard); -+ return -EEXIST; -+} -+ -+static void ve_list_del(struct ve_struct *ve) -+{ -+ write_lock_irq(&ve_list_guard); -+ if (ve->prev) -+ ve->prev->next = ve->next; -+ else -+ ve_list_head = ve->next; -+ if (ve->next) -+ ve->next->prev = ve->prev; -+ nr_ve--; -+ write_unlock_irq(&ve_list_guard); -+} -+ -+static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve) -+{ -+ spin_lock(&task_capability_lock); -+ cap_mask(tsk->cap_effective, ve->cap_default); -+ cap_mask(tsk->cap_inheritable, ve->cap_default); -+ cap_mask(tsk->cap_permitted, ve->cap_default); -+ spin_unlock(&task_capability_lock); -+} -+ -+static void move_task(struct task_struct *tsk, struct ve_struct *new, -+ struct ve_struct *old) -+{ -+ /* this probihibts ptracing of task entered to VPS from host system */ -+ tsk->mm->vps_dumpable = 0; -+ /* setup capabilities before enter */ -+ set_task_ve_caps(tsk, new); -+ -+ write_lock_irq(&tasklist_lock); -+ VE_TASK_INFO(tsk)->owner_env = new; -+ 
VE_TASK_INFO(tsk)->exec_env = new; -+ REMOVE_VE_LINKS(tsk); -+ SET_VE_LINKS(tsk); -+ -+ atomic_dec(&old->pcounter); -+ atomic_inc(&new->pcounter); -+ real_put_ve(old); -+ get_ve(new); -+ write_unlock_irq(&tasklist_lock); -+} -+ -+#if (defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)) && \ -+ defined(CONFIG_NETFILTER) && defined(CONFIG_VE_IPTABLES) -+extern int init_netfilter(void); -+extern void fini_netfilter(void); -+#define init_ve_netfilter() init_netfilter() -+#define fini_ve_netfilter() fini_netfilter() -+#else -+#define init_ve_netfilter() (0) -+#define fini_ve_netfilter() do { } while (0) -+#endif -+ -+#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args) \ -+({ \ -+ int ret = 0; \ -+ if (VE_IPT_CMP(mask, full_mask) && \ -+ VE_IPT_CMP((ve)->_iptables_modules, \ -+ full_mask & ~(full_mask##_MOD))) { \ -+ ret = KSYMERRCALL(1, mod, name, args); \ -+ if (ret == 0) \ -+ (ve)->_iptables_modules |= \ -+ full_mask##_MOD; \ -+ if (ret == 1) \ -+ ret = 0; \ -+ } \ -+ ret; \ -+}) -+ -+#define KSYMIPTFINI(mask, full_mask, mod, name, args) \ -+({ \ -+ if (VE_IPT_CMP(mask, full_mask##_MOD)) \ -+ KSYMSAFECALL_VOID(mod, name, args); \ -+}) -+ -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask, -+ int init_or_cleanup) -+{ -+ int err; -+ -+ err = 0; -+ if (!init_or_cleanup) -+ goto cleanup; -+ -+ /* init part */ -+#if defined(CONFIG_NETFILTER_XTABLES) || \ -+ defined(CONFIG_NETFILTER_XTABLES_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, -+ x_tables, init_xtables, ()); -+ if (err < 0) -+ goto err_xtables; -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, -+ xt_tcpudp, init_xt_tcpudp, ()); -+ if (err < 0) -+ goto err_xt_tcpudp; -+#endif -+#if defined(CONFIG_IP_NF_IPTABLES) || \ -+ defined(CONFIG_IP_NF_IPTABLES_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, -+ ip_tables, init_iptables, ()); -+ if (err < 0) -+ goto err_iptables; -+#endif -+#if 
defined(CONFIG_IP_NF_CONNTRACK) || \ -+ defined(CONFIG_IP_NF_CONNTRACK_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK, -+ ip_conntrack, init_iptable_conntrack, ()); -+ if (err < 0) -+ goto err_iptable_conntrack; -+#endif -+#if defined(CONFIG_IP_NF_FTP) || \ -+ defined(CONFIG_IP_NF_FTP_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_FTP, -+ ip_conntrack_ftp, init_iptable_ftp, ()); -+ if (err < 0) -+ goto err_iptable_ftp; -+#endif -+#if defined(CONFIG_IP_NF_IRC) || \ -+ defined(CONFIG_IP_NF_IRC_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_IRC, -+ ip_conntrack_irc, init_iptable_irc, ()); -+ if (err < 0) -+ goto err_iptable_irc; -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_CONNTRACK, -+ xt_conntrack, init_xt_conntrack_match, ()); -+ if (err < 0) -+ goto err_xt_conntrack_match; -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_STATE) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_STATE_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_STATE, -+ xt_state, init_xt_state, ()); -+ if (err < 0) -+ goto err_xt_state; -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_HELPER) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_HELPER_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_HELPER, -+ xt_helper, init_xt_helper, ()); -+ if (err < 0) -+ goto err_xt_helper; -+#endif -+#if defined(CONFIG_IP_NF_NAT) || \ -+ defined(CONFIG_IP_NF_NAT_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, -+ ip_nat, ip_nat_init, ()); -+ if (err < 0) -+ goto err_iptable_nat; -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, -+ iptable_nat, init_iptable_nat, ()); -+ if (err < 0) -+ goto err_iptable_nat2; -+#endif -+#if defined(CONFIG_IP_NF_NAT_FTP) || \ -+ defined(CONFIG_IP_NF_NAT_FTP_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_FTP, -+ ip_nat_ftp, init_iptable_nat_ftp, ()); -+ if (err < 0) -+ goto err_iptable_nat_ftp; -+#endif -+#if 
defined(CONFIG_IP_NF_NAT_IRC) || \ -+ defined(CONFIG_IP_NF_NAT_IRC_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_IRC, -+ ip_nat_irc, init_iptable_nat_irc, ()); -+ if (err < 0) -+ goto err_iptable_nat_irc; -+#endif -+#if defined(CONFIG_IP_NF_FILTER) || \ -+ defined(CONFIG_IP_NF_FILTER_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER, -+ iptable_filter, init_iptable_filter, ()); -+ if (err < 0) -+ goto err_iptable_filter; -+#endif -+#if defined(CONFIG_IP_NF_MANGLE) || \ -+ defined(CONFIG_IP_NF_MANGLE_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE, -+ iptable_mangle, init_iptable_mangle, ()); -+ if (err < 0) -+ goto err_iptable_mangle; -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_LIMIT) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_LIMIT_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_LIMIT, -+ xt_limit, init_xt_limit, ()); -+ if (err < 0) -+ goto err_xt_limit; -+#endif -+#if defined(CONFIG_IP_NF_MATCH_MULTIPORT) || \ -+ defined(CONFIG_IP_NF_MATCH_MULTIPORT_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_MULTIPORT, -+ ipt_multiport, init_iptable_multiport, ()); -+ if (err < 0) -+ goto err_iptable_multiport; -+#endif -+#if defined(CONFIG_IP_NF_MATCH_TOS) || \ -+ defined(CONFIG_IP_NF_MATCH_TOS_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TOS, -+ ipt_tos, init_iptable_tos, ()); -+ if (err < 0) -+ goto err_iptable_tos; -+#endif -+#if defined(CONFIG_IP_NF_TARGET_TOS) || \ -+ defined(CONFIG_IP_NF_TARGET_TOS_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_TOS, -+ ipt_TOS, init_iptable_TOS, ()); -+ if (err < 0) -+ goto err_iptable_TOS; -+#endif -+#if defined(CONFIG_IP_NF_TARGET_REJECT) || \ -+ defined(CONFIG_IP_NF_TARGET_REJECT_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REJECT, -+ ipt_REJECT, init_iptable_REJECT, ()); -+ if (err < 0) -+ goto err_iptable_REJECT; -+#endif -+#if defined(CONFIG_IP_NF_TARGET_TCPMSS) || \ -+ defined(CONFIG_IP_NF_TARGET_TCPMSS_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, 
VE_IP_TARGET_TCPMSS, -+ ipt_TCPMSS, init_iptable_TCPMSS, ()); -+ if (err < 0) -+ goto err_iptable_TCPMSS; -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TCPMSS, -+ xt_tcpmss, init_xt_tcpmss, ()); -+ if (err < 0) -+ goto err_xt_tcpmss; -+#endif -+#if defined(CONFIG_IP_NF_MATCH_TTL) || \ -+ defined(CONFIG_IP_NF_MATCH_TTL_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TTL, -+ ipt_ttl, init_iptable_ttl, ()); -+ if (err < 0) -+ goto err_iptable_ttl; -+#endif -+#if defined(CONFIG_IP_NF_TARGET_LOG) || \ -+ defined(CONFIG_IP_NF_TARGET_LOG_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_LOG, -+ ipt_LOG, init_iptable_LOG, ()); -+ if (err < 0) -+ goto err_iptable_LOG; -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_LENGTH) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_LENGTH_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_LENGTH, -+ xt_length, init_xt_length, ()); -+ if (err < 0) -+ goto err_xt_length; -+#endif -+#if defined(CONFIG_IP_NF_TARGET_REDIRECT) || \ -+ defined(CONFIG_IP_NF_TARGET_REDIRECT_MODULE) -+ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REDIRECT, -+ ipt_REDIRECT, init_iptable_REDIRECT, ()); -+ if (err < 0) -+ goto err_iptable_REDIRECT; -+#endif -+ return 0; -+ -+/* ------------------------------------------------------------------------- */ -+ -+cleanup: -+#if defined(CONFIG_IP_NF_TARGET_REDIRECT) || \ -+ defined(CONFIG_IP_NF_TARGET_REDIRECT_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REDIRECT, -+ ipt_REDIRECT, fini_iptable_REDIRECT, ()); -+err_iptable_REDIRECT: -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_LENGTH) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_LENGTH_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_LENGTH, -+ xt_length, fini_xt_length, ()); -+err_xt_length: -+#endif -+#if defined(CONFIG_IP_NF_TARGET_LOG) || \ -+ defined(CONFIG_IP_NF_TARGET_LOG_MODULE) -+ 
KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_LOG, -+ ipt_LOG, fini_iptable_LOG, ()); -+err_iptable_LOG: -+#endif -+#if defined(CONFIG_IP_NF_MATCH_TTL) || \ -+ defined(CONFIG_IP_NF_MATCH_TTL_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TTL, -+ ipt_ttl, fini_iptable_ttl, ()); -+err_iptable_ttl: -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TCPMSS, -+ xt_tcpmss, fini_xt_tcpmss, ()); -+err_xt_tcpmss: -+#endif -+#if defined(CONFIG_IP_NF_TARGET_TCPMSS) || \ -+ defined(CONFIG_IP_NF_TARGET_TCPMSS_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_TCPMSS, -+ ipt_TCPMSS, fini_iptable_TCPMSS, ()); -+err_iptable_TCPMSS: -+#endif -+#if defined(CONFIG_IP_NF_TARGET_REJECT) || \ -+ defined(CONFIG_IP_NF_TARGET_REJECT_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REJECT, -+ ipt_REJECT, fini_iptable_REJECT, ()); -+err_iptable_REJECT: -+#endif -+#if defined(CONFIG_IP_NF_TARGET_TOS) || \ -+ defined(CONFIG_IP_NF_TARGET_TOS_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_TOS, -+ ipt_TOS, fini_iptable_TOS, ()); -+err_iptable_TOS: -+#endif -+#if defined(CONFIG_IP_NF_MATCH_TOS) || \ -+ defined(CONFIG_IP_NF_MATCH_TOS_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TOS, -+ ipt_tos, fini_iptable_tos, ()); -+err_iptable_tos: -+#endif -+#if defined(CONFIG_IP_NF_MATCH_MULTIPORT) || \ -+ defined(CONFIG_IP_NF_MATCH_MULTIPORT_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_MULTIPORT, -+ ipt_multiport, fini_iptable_multiport, ()); -+err_iptable_multiport: -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_LIMIT) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_LIMIT_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_LIMIT, -+ xt_limit, fini_xt_limit, ()); -+err_xt_limit: -+#endif -+#if defined(CONFIG_IP_NF_MANGLE) || \ -+ defined(CONFIG_IP_NF_MANGLE_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, -+ 
iptable_mangle, fini_iptable_mangle, ()); -+err_iptable_mangle: -+#endif -+#if defined(CONFIG_IP_NF_FILTER) || \ -+ defined(CONFIG_IP_NF_FILTER_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, -+ iptable_filter, fini_iptable_filter, ()); -+err_iptable_filter: -+#endif -+#if defined(CONFIG_IP_NF_NAT_IRC) || \ -+ defined(CONFIG_IP_NF_NAT_IRC_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_IRC, -+ ip_nat_irc, fini_iptable_nat_irc, ()); -+err_iptable_nat_irc: -+#endif -+#if defined(CONFIG_IP_NF_NAT_FTP) || \ -+ defined(CONFIG_IP_NF_NAT_FTP_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_FTP, -+ ip_nat_ftp, fini_iptable_nat_ftp, ()); -+err_iptable_nat_ftp: -+#endif -+#if defined(CONFIG_IP_NF_NAT) || \ -+ defined(CONFIG_IP_NF_NAT_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, -+ iptable_nat, fini_iptable_nat, ()); -+err_iptable_nat2: -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, -+ ip_nat, ip_nat_cleanup, ()); -+err_iptable_nat: -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_HELPER) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_HELPER_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_HELPER, -+ xt_helper, fini_xt_helper, ()); -+err_xt_helper: -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_STATE) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_STATE_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_STATE, -+ xt_state, fini_xt_state, ()); -+err_xt_state: -+#endif -+#if defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) || \ -+ defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_CONNTRACK, -+ xt_conntrack, fini_xt_conntrack_match, ()); -+err_xt_conntrack_match: -+#endif -+#if defined(CONFIG_IP_NF_IRC) || \ -+ defined(CONFIG_IP_NF_IRC_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_IRC, -+ ip_conntrack_irc, fini_iptable_irc, ()); -+err_iptable_irc: -+#endif -+#if defined(CONFIG_IP_NF_FTP) || \ -+ defined(CONFIG_IP_NF_FTP_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, 
VE_IP_CONNTRACK_FTP, -+ ip_conntrack_ftp, fini_iptable_ftp, ()); -+err_iptable_ftp: -+#endif -+#if defined(CONFIG_IP_NF_CONNTRACK) || \ -+ defined(CONFIG_IP_NF_CONNTRACK_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK, -+ ip_conntrack, fini_iptable_conntrack, ()); -+err_iptable_conntrack: -+#endif -+#if defined(CONFIG_IP_NF_IPTABLES) || \ -+ defined(CONFIG_IP_NF_IPTABLES_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, -+ ip_tables, fini_iptables, ()); -+err_iptables: -+#endif -+#if defined(CONFIG_NETFILTER_XTABLES) || \ -+ defined(CONFIG_NETFILTER_XTABLES_MODULE) -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, -+ xt_tcpudp, fini_xt_tcpudp, ()); -+err_xt_tcpudp: -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, -+ x_tables, fini_xtables, ()); -+err_xtables: -+#endif -+ ve->_iptables_modules = 0; -+ -+ return err; -+} -+#else -+#define do_ve_iptables(ve, initmask, init) (0) -+#endif -+ -+static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask) -+{ -+ return do_ve_iptables(ve, init_mask, 1); -+} -+ -+static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask) -+{ -+ (void)do_ve_iptables(ve, init_mask, 0); -+} -+ -+static void flush_ve_iptables(struct ve_struct *ve) -+{ -+ /* -+ * flush all rule tables first, -+ * this helps us to avoid refs to freed objs -+ */ -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ip_tables, -+ ipt_flush_table, (ve->_ipt_mangle_table)); -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ip_tables, -+ ipt_flush_table, (ve->_ve_ipt_filter_pf)); -+ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ip_tables, -+ ipt_flush_table, (ve->_ip_conntrack->_ip_nat_table)); -+} -+ -+static struct list_head ve_hooks[VE_MAX_HOOKS]; -+static DECLARE_RWSEM(ve_hook_sem); -+ -+int ve_hook_register(struct ve_hook *vh) -+{ -+ struct list_head *lh; -+ struct ve_hook *tmp; -+ -+ down_write(&ve_hook_sem); -+ list_for_each(lh, &ve_hooks[vh->hooknum]) { -+ tmp = list_entry(lh, struct ve_hook, 
list); -+ if (vh->priority < tmp->priority) -+ break; -+ } -+ list_add_tail(&vh->list, lh); -+ up_write(&ve_hook_sem); -+ return 0; -+} -+EXPORT_SYMBOL(ve_hook_register); -+ -+void ve_hook_unregister(struct ve_hook *vh) -+{ -+ down_write(&ve_hook_sem); -+ list_del(&vh->list); -+ up_write(&ve_hook_sem); -+} -+EXPORT_SYMBOL(ve_hook_unregister); -+ -+static int ve_hook_iterate(unsigned int hooknum, void *data) -+{ -+ struct ve_hook *vh; -+ int err; -+ -+ err = 0; -+ down_read(&ve_hook_sem); -+ list_for_each_entry(vh, &ve_hooks[hooknum], list) { -+ if (!try_module_get(vh->owner)) -+ continue; -+ err = vh->hook(hooknum, data); -+ module_put(vh->owner); -+ if (err) -+ break; -+ } -+ -+ if (err) { -+ list_for_each_entry_continue_reverse(vh, -+ &ve_hooks[hooknum], list) { -+ if (!try_module_get(vh->owner)) -+ continue; -+ if (vh->undo) -+ vh->undo(hooknum, data); -+ module_put(vh->owner); -+ } -+ } -+ up_read(&ve_hook_sem); -+ return err; -+} -+ -+static void ve_hook_iterate_cleanup(unsigned int hooknum, void *data) -+{ -+ struct ve_hook *vh; -+ -+ down_read(&ve_hook_sem); -+ list_for_each_entry_reverse(vh, &ve_hooks[hooknum], list) { -+ if (!try_module_get(vh->owner)) -+ continue; -+ (void)vh->hook(hooknum, data); -+ module_put(vh->owner); -+ } -+ up_read(&ve_hook_sem); -+} -+ -+static int do_env_create(envid_t veid, unsigned int flags, u32 class_id, -+ env_create_param_t *data, int datalen) -+{ -+ struct task_struct *tsk; -+ struct ve_struct *old; -+ struct ve_struct *old_exec; -+ struct ve_struct *ve; -+ __u64 init_mask; -+ int err; -+ -+ tsk = current; -+ old = VE_TASK_INFO(tsk)->owner_env; -+ -+ if (!thread_group_leader(tsk)) -+ return -EINVAL; -+ -+ if (tsk->signal->tty) { -+ printk("ERR: VE init has controlling terminal\n"); -+ return -EINVAL; -+ } -+ if (tsk->signal->pgrp != tsk->pid || tsk->signal->session != tsk->pid) { -+ int may_setsid; -+ read_lock(&tasklist_lock); -+ may_setsid = (find_pid(PIDTYPE_PGID, tsk->pid) == NULL); -+ read_unlock(&tasklist_lock); -+ 
if (!may_setsid) { -+ printk("ERR: VE init is process group leader\n"); -+ return -EINVAL; -+ } -+ } -+ -+ -+ VZTRACE("%s: veid=%d classid=%d pid=%d\n", -+ __FUNCTION__, veid, class_id, current->pid); -+ -+ err = -ENOMEM; -+ ve = kmalloc(sizeof(struct ve_struct), GFP_KERNEL); -+ if (ve == NULL) -+ goto err_struct; -+ -+ init_ve_struct(ve, veid, class_id, data, tsk); -+ __module_get(THIS_MODULE); -+ down_write(&ve->op_sem); -+ if (flags & VE_LOCK) -+ ve->is_locked = 1; -+ if ((err = ve_list_add(ve)) < 0) -+ goto err_exist; -+ -+ /* this should be done before context switching */ -+ if ((err = init_printk(ve)) < 0) -+ goto err_log_wait; -+ -+ old_exec = set_exec_env(ve); -+ -+ if ((err = init_ve_sched(ve)) < 0) -+ goto err_sched; -+ -+ /* move user to VE */ -+ if ((err = set_user(0, 0)) < 0) -+ goto err_set_user; -+ -+ set_ve_root(ve, tsk); -+ -+ if ((err = init_ve_utsname(ve))) -+ goto err_utsname; -+ -+ if ((err = init_ve_mibs(ve))) -+ goto err_mibs; -+ -+ if ((err = init_ve_proc(ve))) -+ goto err_proc; -+ -+ if ((err = init_ve_sysctl(ve))) -+ goto err_sysctl; -+ -+ if ((err = init_ve_sysfs(ve))) -+ goto err_sysfs; -+ -+ if ((err = init_ve_route(ve)) < 0) -+ goto err_route; -+ -+ if ((err = init_ve_netdev())) -+ goto err_dev; -+ -+ if ((err = init_ve_tty_drivers(ve)) < 0) -+ goto err_tty; -+ -+ if ((err = init_ve_shmem(ve))) -+ goto err_shmem; -+ -+ if ((err = init_ve_devpts(ve))) -+ goto err_devpts; -+ -+ /* init SYSV IPC variables */ -+ if ((err = init_ve_ipc(ve)) < 0) -+ goto err_ipc; -+ -+ set_ve_caps(ve, tsk); -+ -+ /* It is safe to initialize netfilter here as routing initialization and -+ interface setup will be done below. This means that NO skb can be -+ passed inside. Den */ -+ /* iptables ve initialization for non ve0; -+ ve0 init is in module_init */ -+ if ((err = init_ve_netfilter()) < 0) -+ goto err_netfilter; -+ -+ init_mask = data ? 
data->iptables_mask : VE_IP_DEFAULT; -+ if ((err = init_ve_iptables(ve, init_mask)) < 0) -+ goto err_iptables; -+ -+ if ((err = alloc_vpid(tsk->pid, 1)) < 0) -+ goto err_vpid; -+ -+ if ((err = ve_hook_iterate(VE_HOOK_INIT, (void *)ve)) < 0) -+ goto err_ve_hook; -+ -+ /* finally: set vpids and move inside */ -+ move_task(tsk, ve, old); -+ -+ set_virt_pid(tsk, 1); -+ set_virt_tgid(tsk, 1); -+ -+ set_special_pids(tsk->pid, tsk->pid); -+ current->signal->tty_old_pgrp = 0; -+ set_virt_pgid(tsk, 1); -+ set_virt_sid(tsk, 1); -+ -+ ve->is_running = 1; -+ up_write(&ve->op_sem); -+ -+ printk(KERN_INFO "VPS: %d: started\n", veid); -+ return veid; -+ -+err_ve_hook: -+ free_vpid(1, ve); -+err_vpid: -+ fini_venet(ve); -+ fini_ve_iptables(ve, init_mask); -+err_iptables: -+ fini_ve_netfilter(); -+err_netfilter: -+ fini_ve_ipc(ve); -+err_ipc: -+ fini_ve_devpts(ve); -+err_devpts: -+ fini_ve_shmem(ve); -+err_shmem: -+ fini_ve_tty_drivers(ve); -+err_tty: -+ fini_ve_netdev(); -+err_dev: -+ fini_ve_route(ve); -+err_route: -+ fini_ve_sysfs(ve); -+err_sysfs: -+ fini_ve_sysctl(ve); -+err_sysctl: -+ fini_ve_proc(ve); -+err_proc: -+ do_clean_devperms(ve->veid); /* register procfs adds devperms */ -+ fini_ve_mibs(ve); -+err_mibs: -+ /* free_ve_utsname() is called inside real_put_ve() */ ; -+err_utsname: -+ /* It is safe to restore current->envid here because -+ * ve_fairsched_detach does not use current->envid. */ -+ /* Really fairsched code uses current->envid in sys_fairsched_mknod -+ * only. It is correct if sys_fairsched_mknod is called from -+ * userspace. If sys_fairsched_mknod is called from -+ * ve_fairsched_attach, then node->envid and node->parent_node->envid -+ * are explicitly set to valid value after the call. 
*/ -+ /* FIXME */ -+ VE_TASK_INFO(tsk)->owner_env = old; -+ VE_TASK_INFO(tsk)->exec_env = old_exec; -+ /* move user back */ -+ if (set_user(0, 0) < 0) -+ printk(KERN_WARNING"Can't restore UID\n"); -+ -+err_set_user: -+ fini_ve_sched(ve); -+err_sched: -+ (void)set_exec_env(old_exec); -+ -+ /* we can jump here having incorrect envid */ -+ VE_TASK_INFO(tsk)->owner_env = old; -+ fini_printk(ve); -+err_log_wait: -+ ve_list_del(ve); -+ up_write(&ve->op_sem); -+ -+ real_put_ve(ve); -+err_struct: -+ printk(KERN_INFO "VPS: %d: failed to start with err=%d\n", veid, err); -+ return err; -+ -+err_exist: -+ kfree(ve); -+ goto err_struct; -+} -+ -+ -+/********************************************************************** -+ ********************************************************************** -+ * -+ * VE start/stop callbacks -+ * -+ ********************************************************************** -+ **********************************************************************/ -+ -+int real_env_create(envid_t veid, unsigned flags, u32 class_id, -+ env_create_param_t *data, int datalen) -+{ -+ int status; -+ struct ve_struct *ve; -+ -+ if (!flags) { -+ status = get_exec_env()->veid; -+ goto out; -+ } -+ -+ status = -EPERM; -+ if (!capable(CAP_SETVEID)) -+ goto out; -+ -+ status = -EINVAL; -+ if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE))) -+ goto out; -+ -+ status = -EINVAL; -+ ve = get_ve_by_id(veid); -+ if (ve) { -+ if (flags & VE_TEST) { -+ status = 0; -+ goto out_put; -+ } -+ if (flags & VE_EXCLUSIVE) { -+ status = -EACCES; -+ goto out_put; -+ } -+ if (flags & VE_CREATE) { -+ flags &= ~VE_CREATE; -+ flags |= VE_ENTER; -+ } -+ } else { -+ if (flags & (VE_TEST|VE_ENTER)) { -+ status = -ESRCH; -+ goto out; -+ } -+ } -+ -+ if (flags & VE_CREATE) { -+ status = do_env_create(veid, flags, class_id, data, datalen); -+ goto out; -+ } else if (flags & VE_ENTER) -+ status = do_env_enter(ve, flags); -+ -+ /* else: returning EINVAL */ -+ -+out_put: -+ real_put_ve(ve); -+out: -+ 
return status; -+} -+ -+static int do_env_enter(struct ve_struct *ve, unsigned int flags) -+{ -+ struct task_struct *tsk = current; -+ int err; -+ -+ VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid); -+ -+ err = -EBUSY; -+ down_read(&ve->op_sem); -+ if (!ve->is_running) -+ goto out_up; -+ if (ve->is_locked && !(flags & VE_SKIPLOCK)) -+ goto out_up; -+ -+#ifdef CONFIG_FAIRSCHED -+ err = sys_fairsched_mvpr(current->pid, ve->veid); -+ if (err) -+ goto out_up; -+#endif -+ -+ ve_sched_attach(ve); -+ move_task(current, ve, VE_TASK_INFO(tsk)->owner_env); -+ err = VE_TASK_INFO(tsk)->owner_env->veid; -+ -+out_up: -+ up_read(&ve->op_sem); -+ return err; -+} -+ -+static void env_cleanup(struct ve_struct *ve) -+{ -+ struct ve_struct *old_ve; -+ -+ VZTRACE("real_do_env_cleanup\n"); -+ -+ down_read(&ve->op_sem); -+ old_ve = set_exec_env(ve); -+ -+ ve_hook_iterate_cleanup(VE_HOOK_FINI, (void *)ve); -+ -+ fini_venet(ve); -+ -+ /* no new packets in flight beyond this point */ -+ synchronize_net(); -+ /* skb hold dst_entry, and in turn lies in the ip fragment queue */ -+ ip_fragment_cleanup(ve); -+ -+ fini_ve_netdev(); -+ fini_ve_route(ve); -+ -+ /* kill iptables */ -+ /* No skb belonging to VE can exist at this point as unregister_netdev -+ is an operation awaiting until ALL skb's gone */ -+ flush_ve_iptables(ve); -+ fini_ve_iptables(ve, ve->_iptables_modules); -+ fini_ve_netfilter(); -+ -+ ve_ipc_cleanup(); -+ -+ fini_ve_sched(ve); -+ do_clean_devperms(ve->veid); -+ -+ fini_ve_devpts(ve); -+ fini_ve_shmem(ve); -+ fini_ve_sysfs(ve); -+ unregister_ve_tty_drivers(ve); -+ fini_ve_sysctl(ve); -+ fini_ve_proc(ve); -+ -+ fini_ve_mibs(ve); -+ -+ (void)set_exec_env(old_ve); -+ fini_printk(ve); /* no printk can happen in ve context anymore */ -+ -+ ve_list_del(ve); -+ up_read(&ve->op_sem); -+ -+ real_put_ve(ve); -+} -+ -+static struct list_head ve_cleanup_list; -+static spinlock_t ve_cleanup_lock; -+ -+static DECLARE_COMPLETION(vzmond_complete); -+static struct task_struct *vzmond_thread; 
-+static volatile int stop_vzmond; -+ -+void real_do_env_cleanup(struct ve_struct *ve) -+{ -+ spin_lock(&ve_cleanup_lock); -+ list_add_tail(&ve->cleanup_list, &ve_cleanup_list); -+ spin_unlock(&ve_cleanup_lock); -+ wake_up_process(vzmond_thread); -+} -+ -+static void do_pending_env_cleanups(void) -+{ -+ struct ve_struct *ve; -+ -+ spin_lock(&ve_cleanup_lock); -+ while (1) { -+ if (list_empty(&ve_cleanup_list) || need_resched()) -+ break; -+ ve = list_entry(ve_cleanup_list.next, struct ve_struct, -+ cleanup_list); -+ list_del(&ve->cleanup_list); -+ spin_unlock(&ve_cleanup_lock); -+ env_cleanup(ve); -+ spin_lock(&ve_cleanup_lock); -+ } -+ spin_unlock(&ve_cleanup_lock); -+} -+ -+static int have_pending_cleanups(void) -+{ -+ return !list_empty(&ve_cleanup_list); -+} -+ -+static int vzmond(void *arg) -+{ -+ daemonize("vzmond"); -+ vzmond_thread = current; -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ while (!stop_vzmond) { -+ schedule(); -+ try_to_freeze(); -+ if (signal_pending(current)) -+ flush_signals(current); -+ -+ do_pending_env_cleanups(); -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (have_pending_cleanups()) -+ __set_current_state(TASK_RUNNING); -+ } -+ -+ __set_task_state(current, TASK_RUNNING); -+ complete_and_exit(&vzmond_complete, 0); -+} -+ -+static int __init init_vzmond(void) -+{ -+ INIT_LIST_HEAD(&ve_cleanup_list); -+ spin_lock_init(&ve_cleanup_lock); -+ stop_vzmond = 0; -+ return kernel_thread(vzmond, NULL, 0); -+} -+ -+static void fini_vzmond(void) -+{ -+ stop_vzmond = 1; -+ wake_up_process(vzmond_thread); -+ wait_for_completion(&vzmond_complete); -+ WARN_ON(!list_empty(&ve_cleanup_list)); -+} -+ -+void real_do_env_free(struct ve_struct *ve) -+{ -+ VZTRACE("real_do_env_free\n"); -+ -+ ve_ipc_free(ve); /* free SYSV IPC resources */ -+ free_ve_tty_drivers(ve); -+ free_ve_utsname(ve); -+ free_ve_sysctl(ve); /* free per ve sysctl data */ -+ free_ve_filesystems(ve); -+ printk(KERN_INFO "VPS: %d: stopped\n", VEID(ve)); -+ kfree(ve); -+ -+ 
module_put(THIS_MODULE); -+} -+EXPORT_SYMBOL(real_do_env_free); -+ -+ -+/********************************************************************** -+ ********************************************************************** -+ * -+ * VE TTY handling -+ * -+ ********************************************************************** -+ **********************************************************************/ -+ -+DCL_VE_OWNER(TTYDRV, struct tty_driver, owner_env) -+ -+static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base, -+ struct ve_struct *ve) -+{ -+ size_t size; -+ struct tty_driver *driver; -+ -+ driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL); -+ if (!driver) -+ goto out; -+ -+ memcpy(driver, base, sizeof(struct tty_driver)); -+ -+ driver->driver_state = NULL; -+ -+ size = base->num * 3 * sizeof(void *); -+ if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { -+ void **p; -+ p = kmalloc(size, GFP_KERNEL); -+ if (!p) -+ goto out_free; -+ memset(p, 0, size); -+ driver->ttys = (struct tty_struct **)p; -+ driver->termios = (struct termios **)(p + driver->num); -+ driver->termios_locked = (struct termios **)(p + driver->num * 2); -+ } else { -+ driver->ttys = NULL; -+ driver->termios = NULL; -+ driver->termios_locked = NULL; -+ } -+ -+ SET_VE_OWNER_TTYDRV(driver, ve); -+ driver->flags |= TTY_DRIVER_INSTALLED; -+ -+ return driver; -+ -+out_free: -+ kfree(driver); -+out: -+ return NULL; -+} -+ -+static void free_ve_tty_driver(struct tty_driver *driver) -+{ -+ if (!driver) -+ return; -+ -+ clear_termios(driver); -+ kfree(driver->ttys); -+ kfree(driver); -+} -+ -+static int alloc_ve_tty_drivers(struct ve_struct* ve) -+{ -+#ifdef CONFIG_LEGACY_PTYS -+ /* Traditional BSD devices */ -+ ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve); -+ if (!ve->pty_driver) -+ goto out_mem; -+ -+ ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve); -+ if (!ve->pty_slave_driver) -+ goto out_mem; -+ -+ ve->pty_driver->other = ve->pty_slave_driver; -+ 
ve->pty_slave_driver->other = ve->pty_driver; -+#endif -+ -+#ifdef CONFIG_UNIX98_PTYS -+ ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve); -+ if (!ve->ptm_driver) -+ goto out_mem; -+ -+ ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve); -+ if (!ve->pts_driver) -+ goto out_mem; -+ -+ ve->ptm_driver->other = ve->pts_driver; -+ ve->pts_driver->other = ve->ptm_driver; -+ -+ ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys), GFP_KERNEL); -+ if (!ve->allocated_ptys) -+ goto out_mem; -+ idr_init(ve->allocated_ptys); -+#endif -+ return 0; -+ -+out_mem: -+ free_ve_tty_drivers(ve); -+ return -ENOMEM; -+} -+ -+static void free_ve_tty_drivers(struct ve_struct* ve) -+{ -+#ifdef CONFIG_LEGACY_PTYS -+ free_ve_tty_driver(ve->pty_driver); -+ free_ve_tty_driver(ve->pty_slave_driver); -+ ve->pty_driver = ve->pty_slave_driver = NULL; -+#endif -+#ifdef CONFIG_UNIX98_PTYS -+ free_ve_tty_driver(ve->ptm_driver); -+ free_ve_tty_driver(ve->pts_driver); -+ kfree(ve->allocated_ptys); -+ ve->ptm_driver = ve->pts_driver = NULL; -+ ve->allocated_ptys = NULL; -+#endif -+} -+ -+static inline void __register_tty_driver(struct tty_driver *driver) -+{ -+ list_add(&driver->tty_drivers, &tty_drivers); -+} -+ -+static inline void __unregister_tty_driver(struct tty_driver *driver) -+{ -+ if (!driver) -+ return; -+ list_del(&driver->tty_drivers); -+} -+ -+static int register_ve_tty_drivers(struct ve_struct* ve) -+{ -+ write_lock_irq(&tty_driver_guard); -+#ifdef CONFIG_UNIX98_PTYS -+ __register_tty_driver(ve->ptm_driver); -+ __register_tty_driver(ve->pts_driver); -+#endif -+#ifdef CONFIG_LEGACY_PTYS -+ __register_tty_driver(ve->pty_driver); -+ __register_tty_driver(ve->pty_slave_driver); -+#endif -+ write_unlock_irq(&tty_driver_guard); -+ -+ return 0; -+} -+ -+static void unregister_ve_tty_drivers(struct ve_struct* ve) -+{ -+ VZTRACE("unregister_ve_tty_drivers\n"); -+ -+ write_lock_irq(&tty_driver_guard); -+ __unregister_tty_driver(ve->pty_driver); -+ 
__unregister_tty_driver(ve->pty_slave_driver); -+#ifdef CONFIG_UNIX98_PTYS -+ __unregister_tty_driver(ve->ptm_driver); -+ __unregister_tty_driver(ve->pts_driver); -+#endif -+ write_unlock_irq(&tty_driver_guard); -+} -+ -+static int init_ve_tty_drivers(struct ve_struct *ve) -+{ -+ int err; -+ -+ if ((err = alloc_ve_tty_drivers(ve))) -+ goto err_ttyalloc; -+ if ((err = register_ve_tty_drivers(ve))) -+ goto err_ttyreg; -+ return 0; -+ -+err_ttyreg: -+ free_ve_tty_drivers(ve); -+err_ttyalloc: -+ return err; -+} -+ -+static void fini_ve_tty_drivers(struct ve_struct *ve) -+{ -+ unregister_ve_tty_drivers(ve); -+ free_ve_tty_drivers(ve); -+} -+ -+/* -+ * Free the termios and termios_locked structures because -+ * we don't want to get memory leaks when modular tty -+ * drivers are removed from the kernel. -+ */ -+static void clear_termios(struct tty_driver *driver) -+{ -+ int i; -+ struct termios *tp; -+ -+ if (driver->termios == NULL) -+ return; -+ for (i = 0; i < driver->num; i++) { -+ tp = driver->termios[i]; -+ if (tp) { -+ driver->termios[i] = NULL; -+ kfree(tp); -+ } -+ tp = driver->termios_locked[i]; -+ if (tp) { -+ driver->termios_locked[i] = NULL; -+ kfree(tp); -+ } -+ } -+} -+ -+ -+/********************************************************************** -+ ********************************************************************** -+ * -+ * Pieces of VE network -+ * -+ ********************************************************************** -+ **********************************************************************/ -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#include <asm/uaccess.h> -+#include <net/sock.h> -+#include <linux/netlink.h> -+#include <linux/rtnetlink.h> -+#include <net/route.h> -+#include <net/ip_fib.h> -+#endif -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+static void ve_del_ip_addrs(struct net_device *dev) -+{ -+ struct in_device *in_dev; -+ -+ in_dev = in_dev_get(dev); -+ if (in_dev == NULL) -+ return; 
-+ -+ while (in_dev->ifa_list != NULL) { -+ inet_del_ifa(in_dev, &in_dev->ifa_list, 1); -+ } -+ in_dev_put(in_dev); -+} -+ -+static int ve_netdev_cleanup(struct net_device *dev, int to_ve) -+{ -+ int err; -+ -+ err = 0; -+ ve_del_ip_addrs(dev); -+ if ((dev->flags & IFF_UP) != 0) -+ err = dev_close(dev); -+ synchronize_net(); -+ dev_shutdown(dev); -+ dev_mc_discard(dev); -+ free_divert_blk(dev); -+ synchronize_net(); -+ -+ if (to_ve) -+ dev->orig_mtu = dev->mtu; -+ else { -+ int rc = dev_set_mtu(dev, dev->orig_mtu); -+ if (err == 0) -+ err = rc; -+ } -+ -+ return err; -+} -+ -+static void __ve_dev_move(struct net_device *dev, struct ve_struct *ve_src, -+ struct ve_struct *ve_dst, struct user_beancounter *exec_ub) -+{ -+ struct net_device **dp, *d; -+ struct user_beancounter *ub; -+ -+ for (d = ve_src->_net_dev_base, dp = NULL; d != NULL; -+ dp = &d->next, d = d->next) { -+ if (d == dev) { -+ hlist_del(&dev->name_hlist); -+ hlist_del(&dev->index_hlist); -+ if (ve_src->_net_dev_tail == &dev->next) -+ ve_src->_net_dev_tail = dp; -+ if (dp) -+ *dp = dev->next; -+ dev->next = NULL; -+ break; -+ } -+ } -+ *ve_dst->_net_dev_tail = dev; -+ ve_dst->_net_dev_tail = &dev->next; -+ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, ve_dst)); -+ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, ve_dst)); -+ dev->owner_env = ve_dst; -+ -+ ub = netdev_bc(dev)->exec_ub; -+ netdev_bc(dev)->exec_ub = get_beancounter(exec_ub); -+ put_beancounter(ub); -+} -+ -+static int ve_dev_add(envid_t veid, char *dev_name) -+{ -+ int err; -+ struct net_device *dev; -+ struct ve_struct *ve; -+ struct hlist_node *p; -+ -+ dev = NULL; -+ err = -ESRCH; -+ -+ ve = get_ve_by_id(veid); -+ if (ve == NULL) -+ goto out; -+ -+ rtnl_lock(); -+ -+ read_lock(&dev_base_lock); -+ hlist_for_each(p, dev_name_hash(dev_name, get_ve0())) { -+ struct net_device *d = hlist_entry(p, struct net_device, -+ name_hlist); -+ if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) { -+ dev = d; -+ break; -+ } -+ } 
-+ read_unlock(&dev_base_lock); -+ if (dev == NULL) -+ goto out_unlock; -+ -+ err = -EPERM; -+ if (!ve_is_dev_movable(dev)) -+ goto out_unlock; -+ -+ err = -EINVAL; -+ if (dev->flags & (IFF_SLAVE|IFF_MASTER)) -+ goto out_unlock; -+ -+ ve_netdev_cleanup(dev, 1); -+ -+ write_lock_bh(&dev_base_lock); -+ __ve_dev_move(dev, get_ve0(), ve, get_exec_ub()); -+ write_unlock_bh(&dev_base_lock); -+ -+ err = 0; -+ -+out_unlock: -+ rtnl_unlock(); -+ real_put_ve(ve); -+ -+ if (dev == NULL) -+ printk(KERN_WARNING "Device %s not found\n", dev_name); -+ -+out: -+ return err; -+} -+ -+static int ve_dev_del(envid_t veid, char *dev_name) -+{ -+ int err; -+ struct net_device *dev; -+ struct ve_struct *ve, *old_exec; -+ struct hlist_node *p; -+ -+ dev = NULL; -+ err = -ESRCH; -+ -+ ve = get_ve_by_id(veid); -+ if (ve == NULL) -+ goto out; -+ -+ rtnl_lock(); -+ -+ read_lock(&dev_base_lock); -+ hlist_for_each(p, dev_name_hash(dev_name, ve)) { -+ struct net_device *d = hlist_entry(p, struct net_device, -+ name_hlist); -+ if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) { -+ dev = d; -+ break; -+ } -+ } -+ read_unlock(&dev_base_lock); -+ if (dev == NULL) -+ goto out_unlock; -+ -+ err = -EPERM; -+ if (!ve_is_dev_movable(dev)) -+ goto out_unlock; -+ -+ old_exec = set_exec_env(ve); -+ ve_netdev_cleanup(dev, 0); -+ (void)set_exec_env(old_exec); -+ -+ write_lock_bh(&dev_base_lock); -+ __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub); -+ write_unlock_bh(&dev_base_lock); -+ -+ err = 0; -+ -+out_unlock: -+ rtnl_unlock(); -+ real_put_ve(ve); -+ -+ if (dev == NULL) -+ printk(KERN_WARNING "Device %s not found\n", dev_name); -+ -+out: -+ return err; -+} -+ -+int real_ve_dev_map(envid_t veid, int op, char *dev_name) -+{ -+ int err; -+ err = -EPERM; -+ if (!capable(CAP_SETVEID)) -+ goto out; -+ switch (op) -+ { -+ case VE_NETDEV_ADD: -+ err = ve_dev_add(veid, dev_name); -+ break; -+ case VE_NETDEV_DEL: -+ err = ve_dev_del(veid, dev_name); -+ break; -+ default: -+ err = -EINVAL; -+ break; -+ } 
-+out: -+ return err; -+} -+ -+static void ve_mapped_devs_cleanup(struct ve_struct *ve) -+{ -+ struct net_device *dev; -+ -+ rtnl_lock(); -+ write_lock_bh(&dev_base_lock); -+restart: -+ for (dev = ve->_net_dev_base; dev != NULL; dev = dev->next) -+ { -+ if ((dev->features & NETIF_F_VENET) || -+ (dev == ve->_loopback_dev)) /* Skip loopback dev */ -+ continue; -+ write_unlock_bh(&dev_base_lock); -+ ve_netdev_cleanup(dev, 0); -+ write_lock_bh(&dev_base_lock); -+ __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub); -+ goto restart; -+ } -+ write_unlock_bh(&dev_base_lock); -+ rtnl_unlock(); -+} -+#endif -+ -+ -+/********************************************************************** -+ ********************************************************************** -+ * -+ * VE information via /proc -+ * -+ ********************************************************************** -+ **********************************************************************/ -+#ifdef CONFIG_PROC_FS -+static int devperms_seq_show(struct seq_file *m, void *v) -+{ -+ struct devperms_struct *dp; -+ char dev_s[32], type_c; -+ unsigned use, type; -+ dev_t dev; -+ -+ dp = (struct devperms_struct *)v; -+ if (dp == (struct devperms_struct *)1L) { -+ seq_printf(m, "Version: 2.7\n"); -+ return 0; -+ } -+ -+ use = dp->type & VE_USE_MASK; -+ type = dp->type & S_IFMT; -+ dev = dp->dev; -+ -+ if ((use | VE_USE_MINOR) == use) -+ snprintf(dev_s, sizeof(dev_s), "%d:%d", MAJOR(dev), MINOR(dev)); -+ else if ((use | VE_USE_MAJOR) == use) -+ snprintf(dev_s, sizeof(dev_s), "%d:*", MAJOR(dp->dev)); -+ else -+ snprintf(dev_s, sizeof(dev_s), "*:*"); -+ -+ if (type == S_IFCHR) -+ type_c = 'c'; -+ else if (type == S_IFBLK) -+ type_c = 'b'; -+ else -+ type_c = '?'; -+ -+ seq_printf(m, "%10u %c %03o %s\n", dp->veid, type_c, dp->mask, dev_s); -+ return 0; -+} -+ -+static void *devperms_seq_start(struct seq_file *m, loff_t *pos) -+{ -+ loff_t cpos; -+ long slot; -+ struct devperms_struct *dp; -+ -+ cpos = *pos; -+ 
read_lock(&devperms_hash_guard); -+ if (cpos-- == 0) -+ return (void *)1L; -+ -+ for (slot = 0; slot < DEVPERMS_HASH_SZ; slot++) -+ for (dp = devperms_hash[slot]; dp; dp = dp->devhash_next) -+ if (cpos-- == 0) { -+ m->private = (void *)slot; -+ return dp; -+ } -+ return NULL; -+} -+ -+static void *devperms_seq_next(struct seq_file *m, void *v, loff_t *pos) -+{ -+ long slot; -+ struct devperms_struct *dp; -+ -+ dp = (struct devperms_struct *)v; -+ -+ if (dp == (struct devperms_struct *)1L) -+ slot = 0; -+ else if (dp->devhash_next == NULL) -+ slot = (long)m->private + 1; -+ else { -+ (*pos)++; -+ return dp->devhash_next; -+ } -+ -+ for (; slot < DEVPERMS_HASH_SZ; slot++) -+ if (devperms_hash[slot]) { -+ (*pos)++; -+ m->private = (void *)slot; -+ return devperms_hash[slot]; -+ } -+ return NULL; -+} -+ -+static void devperms_seq_stop(struct seq_file *m, void *v) -+{ -+ read_unlock(&devperms_hash_guard); -+} -+ -+static struct seq_operations devperms_seq_op = { -+ .start = devperms_seq_start, -+ .next = devperms_seq_next, -+ .stop = devperms_seq_stop, -+ .show = devperms_seq_show, -+}; -+ -+static int devperms_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &devperms_seq_op); -+} -+ -+static struct file_operations proc_devperms_ops = { -+ .open = devperms_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+#if BITS_PER_LONG == 32 -+#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21) -+#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n" -+#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n" -+#else -+#define VESTAT_LINE_WIDTH (12 * 21) -+#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n" -+#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n" -+#endif -+ -+static int vestat_seq_show(struct seq_file *m, void *v) -+{ -+ struct ve_struct *ve = (struct 
ve_struct *)v; -+ struct ve_struct *curve; -+ int cpu; -+ unsigned long user_ve, nice_ve, system_ve, uptime; -+ cycles_t uptime_cycles, idle_time, strv_time, used; -+ -+ curve = get_exec_env(); -+ if (ve == ve_list_head || -+ (!ve_is_super(curve) && ve == curve)) { -+ /* print header */ -+ seq_printf(m, "%-*s\n", -+ VESTAT_LINE_WIDTH - 1, -+ "Version: 2.2"); -+ seq_printf(m, VESTAT_HEAD_FMT, "VEID", -+ "user", "nice", "system", -+ "uptime", "idle", -+ "strv", "uptime", "used", -+ "maxlat", "totlat", "numsched"); -+ } -+ -+ if (ve == get_ve0()) -+ return 0; -+ -+ user_ve = nice_ve = system_ve = 0; -+ idle_time = strv_time = used = 0; -+ -+ for (cpu = 0; cpu < NR_CPUS; cpu++) { -+ struct ve_cpu_stats *st; -+ -+ st = VE_CPU_STATS(ve, cpu); -+ user_ve += st->user; -+ nice_ve += st->nice; -+ system_ve += st->system; -+ used += VE_CPU_STATS(ve, cpu)->used_time; -+ idle_time += ve_sched_get_idle_time(ve, cpu); -+ } -+ uptime_cycles = get_cycles() - ve->start_cycles; -+ uptime = jiffies - ve->start_jiffies; -+ -+ seq_printf(m, VESTAT_LINE_FMT, ve->veid, -+ user_ve, nice_ve, system_ve, -+ uptime, idle_time, -+ strv_time, uptime_cycles, used, -+ ve->sched_lat_ve.last.maxlat, -+ ve->sched_lat_ve.last.totlat, -+ ve->sched_lat_ve.last.count); -+ return 0; -+} -+ -+static void *ve_seq_start(struct seq_file *m, loff_t *pos) -+{ -+ struct ve_struct *ve, *curve; -+ loff_t l; -+ -+ curve = get_exec_env(); -+ read_lock(&ve_list_guard); -+ if (!ve_is_super(curve)) { -+ if (*pos != 0) -+ return NULL; -+ return curve; -+ } -+ for (ve = ve_list_head, l = *pos; -+ ve != NULL && l > 0; -+ ve = ve->next, l--); -+ return ve; -+} -+ -+static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) -+{ -+ struct ve_struct *ve = (struct ve_struct *)v; -+ -+ if (!ve_is_super(get_exec_env())) -+ return NULL; -+ (*pos)++; -+ return ve->next; -+} -+ -+static void ve_seq_stop(struct seq_file *m, void *v) -+{ -+ read_unlock(&ve_list_guard); -+} -+ -+static struct seq_operations vestat_seq_op = { 
-+ start: ve_seq_start, -+ next: ve_seq_next, -+ stop: ve_seq_stop, -+ show: vestat_seq_show -+}; -+ -+static int vestat_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &vestat_seq_op); -+} -+ -+static struct file_operations proc_vestat_operations = { -+ open: vestat_open, -+ read: seq_read, -+ llseek: seq_lseek, -+ release: seq_release -+}; -+ -+static int __init init_vecalls_proc(void) -+{ -+ struct proc_dir_entry *de; -+ -+ de = create_proc_glob_entry("vz/vestat", -+ S_IFREG|S_IRUSR, NULL); -+ if (de == NULL) { -+ /* create "vz" subdirectory, if not exist */ -+ (void) create_proc_glob_entry("vz", -+ S_IFDIR|S_IRUGO|S_IXUGO, NULL); -+ de = create_proc_glob_entry("vz/vestat", -+ S_IFREG|S_IRUSR, NULL); -+ } -+ if (de) -+ de->proc_fops = &proc_vestat_operations; -+ else -+ printk(KERN_WARNING -+ "VZMON: can't make vestat proc entry\n"); -+ -+ de = create_proc_entry("vz/devperms", S_IFREG | S_IRUSR, NULL); -+ if (de) -+ de->proc_fops = &proc_devperms_ops; -+ else -+ printk(KERN_WARNING -+ "VZMON: can't make devperms proc entry\n"); -+ return 0; -+} -+ -+static void fini_vecalls_proc(void) -+{ -+ remove_proc_entry("vz/devperms", NULL); -+ remove_proc_entry("vz/vestat", NULL); -+} -+#else -+#define init_vecalls_proc() (0) -+#define fini_vecalls_proc() do { } while (0) -+#endif /* CONFIG_PROC_FS */ -+ -+ -+/********************************************************************** -+ ********************************************************************** -+ * -+ * User ctl -+ * -+ ********************************************************************** -+ **********************************************************************/ -+ -+int vzcalls_ioctl(struct inode *, struct file *, unsigned int, unsigned long); -+static struct vzioctlinfo vzcalls = { -+ type: VZCTLTYPE, -+ func: vzcalls_ioctl, -+ owner: THIS_MODULE, -+}; -+ -+int vzcalls_ioctl(struct inode *ino, struct file *file, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err; -+ -+ err = 
-ENOTTY; -+ switch(cmd) { -+ case VZCTL_MARK_ENV_TO_DOWN: { -+ /* Compatibility issue */ -+ err = 0; -+ } -+ break; -+ case VZCTL_SETDEVPERMS: { -+ /* Device type was mistakenly declared as dev_t -+ * in the old user-kernel interface. -+ * That's wrong, dev_t is a kernel internal type. -+ * I use `unsigned' not having anything better in mind. -+ * 2001/08/11 SAW */ -+ struct vzctl_setdevperms s; -+ err = -EFAULT; -+ if (copy_from_user(&s, (void *)arg, sizeof(s))) -+ break; -+ err = real_setdevperms(s.veid, s.type, -+ new_decode_dev(s.dev), s.mask); -+ } -+ break; -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+ case VZCTL_VE_NETDEV: { -+ struct vzctl_ve_netdev d; -+ char *s; -+ err = -EFAULT; -+ if (copy_from_user(&d, (void *)arg, sizeof(d))) -+ break; -+ err = -ENOMEM; -+ s = kmalloc(IFNAMSIZ+1, GFP_KERNEL); -+ if (s == NULL) -+ break; -+ err = -EFAULT; -+ if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) { -+ s[IFNAMSIZ] = 0; -+ err = real_ve_dev_map(d.veid, d.op, s); -+ } -+ kfree(s); -+ } -+ break; -+#endif -+ case VZCTL_ENV_CREATE: { -+ struct vzctl_env_create s; -+ err = -EFAULT; -+ if (copy_from_user(&s, (void *)arg, sizeof(s))) -+ break; -+ err = real_env_create(s.veid, s.flags, s.class_id, -+ NULL, 0); -+ } -+ break; -+ case VZCTL_ENV_CREATE_DATA: { -+ struct vzctl_env_create_data s; -+ env_create_param_t *data; -+ err = -EFAULT; -+ if (copy_from_user(&s, (void *)arg, sizeof(s))) -+ break; -+ err=-EINVAL; -+ if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN || -+ s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN || -+ s.data == 0) -+ break; -+ err = -ENOMEM; -+ data = kmalloc(sizeof(*data), GFP_KERNEL); -+ if (!data) -+ break; -+ memset(data, 0, sizeof(*data)); -+ err = -EFAULT; -+ if (copy_from_user(data, (void *)s.data, s.datalen)) -+ goto free_data; -+ err = real_env_create(s.veid, s.flags, s.class_id, -+ data, s.datalen); -+free_data: -+ kfree(data); -+ } -+ break; -+ case VZCTL_GET_CPU_STAT: { -+ struct vzctl_cpustatctl s; -+ err = -EFAULT; 
-+ if (copy_from_user(&s, (void *)arg, sizeof(s))) -+ break; -+ err = ve_get_cpu_stat(s.veid, s.cpustat); -+ } -+ break; -+ } -+ return err; -+} -+EXPORT_SYMBOL(real_env_create); -+ -+ -+/********************************************************************** -+ ********************************************************************** -+ * -+ * Init/exit stuff -+ * -+ ********************************************************************** -+ **********************************************************************/ -+ -+#ifdef CONFIG_VE_CALLS_MODULE -+static int __init init_vecalls_symbols(void) -+{ -+ KSYMRESOLVE(real_get_device_perms_ve); -+ KSYMRESOLVE(real_do_env_cleanup); -+ KSYMRESOLVE(real_do_env_free); -+ KSYMRESOLVE(real_update_load_avg_ve); -+ KSYMMODRESOLVE(vzmon); -+ return 0; -+} -+ -+static void fini_vecalls_symbols(void) -+{ -+ KSYMMODUNRESOLVE(vzmon); -+ KSYMUNRESOLVE(real_get_device_perms_ve); -+ KSYMUNRESOLVE(real_do_env_cleanup); -+ KSYMUNRESOLVE(real_do_env_free); -+ KSYMUNRESOLVE(real_update_load_avg_ve); -+} -+#else -+#define init_vecalls_symbols() (0) -+#define fini_vecalls_symbols() do { } while (0) -+#endif -+ -+static inline __init int init_vecalls_ioctls(void) -+{ -+ vzioctl_register(&vzcalls); -+ return 0; -+} -+ -+static inline void fini_vecalls_ioctls(void) -+{ -+ vzioctl_unregister(&vzcalls); -+} -+ -+static int __init vecalls_init(void) -+{ -+ int err; -+ int i; -+ -+ ve_list_head = get_ve0(); -+ -+ err = init_vzmond(); -+ if (err < 0) -+ goto out_vzmond; -+ -+ err = init_devperms_hash(); -+ if (err < 0) -+ goto out_perms; -+ -+ err = init_vecalls_symbols(); -+ if (err < 0) -+ goto out_sym; -+ -+ err = init_vecalls_proc(); -+ if (err < 0) -+ goto out_proc; -+ -+ err = init_vecalls_ioctls(); -+ if (err < 0) -+ goto out_ioctls; -+ -+ for (i = 0; i < VE_MAX_HOOKS; i++) -+ INIT_LIST_HEAD(&ve_hooks[i]); -+ -+ return 0; -+ -+out_ioctls: -+ fini_vecalls_proc(); -+out_proc: -+ fini_vecalls_symbols(); -+out_sym: -+ fini_devperms_hash(); -+out_perms: 
-+ fini_vzmond(); -+out_vzmond: -+ return err; -+} -+ -+static void vecalls_exit(void) -+{ -+ fini_vecalls_ioctls(); -+ fini_vecalls_proc(); -+ fini_vecalls_symbols(); -+ fini_devperms_hash(); -+ fini_vzmond(); -+} -+ -+EXPORT_SYMBOL(get_ve_by_id); -+EXPORT_SYMBOL(__find_ve_by_id); -+EXPORT_SYMBOL(ve_list_guard); -+EXPORT_SYMBOL(ve_list_head); -+EXPORT_SYMBOL(nr_ve); -+ -+MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); -+MODULE_DESCRIPTION("Virtuozzo Control"); -+MODULE_LICENSE("GPL v2"); -+ -+module_init(vecalls_init) -+module_exit(vecalls_exit) -diff -upr linux-2.6.16.orig/kernel/veowner.c linux-2.6.16-026test009/kernel/veowner.c ---- linux-2.6.16.orig/kernel/veowner.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/veowner.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,304 @@ -+/* -+ * kernel/veowner.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/sched.h> -+#include <linux/ve.h> -+#include <linux/ve_owner.h> -+#include <linux/ve_proto.h> -+#include <linux/ipc.h> -+#include <linux/fs.h> -+#include <linux/proc_fs.h> -+#include <linux/file.h> -+#include <linux/mm.h> -+#include <linux/delay.h> -+#include <linux/vmalloc.h> -+#include <linux/init.h> -+#include <linux/module.h> -+#include <linux/list.h> -+#include <linux/inetdevice.h> -+#include <asm/system.h> -+#include <asm/io.h> -+ -+#include <net/tcp.h> -+ -+void prepare_ve0_process(struct task_struct *tsk) -+{ -+ set_virt_pid(tsk, tsk->pid); -+ set_virt_tgid(tsk, tsk->tgid); -+ if (tsk->signal) { -+ set_virt_pgid(tsk, tsk->signal->pgrp); -+ set_virt_sid(tsk, tsk->signal->session); -+ } -+ VE_TASK_INFO(tsk)->exec_env = get_ve0(); -+ VE_TASK_INFO(tsk)->owner_env = get_ve0(); -+ VE_TASK_INFO(tsk)->sleep_time = 0; -+ VE_TASK_INFO(tsk)->wakeup_stamp = 0; -+ VE_TASK_INFO(tsk)->sched_time = 0; -+ seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock); -+ -+ if (tsk->pid) { -+ 
SET_VE_LINKS(tsk); -+ atomic_inc(&get_ve0()->pcounter); -+ } -+} -+ -+void prepare_ve0_loopback(void) -+{ -+ get_ve0()->_loopback_dev = &loopback_dev; -+} -+ -+/* -+ * ------------------------------------------------------------------------ -+ * proc entries -+ * ------------------------------------------------------------------------ -+ */ -+ -+#ifdef CONFIG_PROC_FS -+static void proc_move(struct proc_dir_entry *ddir, -+ struct proc_dir_entry *sdir, -+ const char *name) -+{ -+ struct proc_dir_entry **p, *q; -+ int len; -+ -+ len = strlen(name); -+ for (p = &sdir->subdir, q = *p; q != NULL; p = &q->next, q = *p) -+ if (proc_match(len, name, q)) -+ break; -+ if (q == NULL) -+ return; -+ *p = q->next; -+ q->parent = ddir; -+ q->next = ddir->subdir; -+ ddir->subdir = q; -+} -+static void prepare_proc_misc(void) -+{ -+ static char *table[] = { -+ "loadavg", -+ "uptime", -+ "meminfo", -+ "version", -+ "stat", -+ "filesystems", -+ "locks", -+ "swaps", -+ "mounts", -+ "net", -+ "cpuinfo", -+ "sysvipc", -+ "sys", -+ "fs", -+ "vz", -+ "user_beancounters", -+ "cmdline", -+ "vmstat", -+ "modules", -+ "kmsg", -+ NULL, -+ }; -+ char **p; -+ -+ for (p = table; *p != NULL; p++) -+ proc_move(&proc_root, ve0.proc_root, *p); -+} -+int prepare_proc(void) -+{ -+ struct ve_struct *envid; -+ struct proc_dir_entry *de; -+ struct proc_dir_entry *ve_root; -+ -+ envid = set_exec_env(&ve0); -+ ve_root = ve0.proc_root->subdir; -+ /* move the whole tree to be visible in VE0 only */ -+ ve0.proc_root->subdir = proc_root.subdir; -+ for (de = ve0.proc_root->subdir; de->next != NULL; de = de->next) -+ de->parent = ve0.proc_root; -+ de->parent = ve0.proc_root; -+ de->next = ve_root; -+ -+ /* move back into the global scope some specific entries */ -+ proc_root.subdir = NULL; -+ prepare_proc_misc(); -+ proc_net = proc_mkdir("net", ve0.proc_root); -+ proc_net_stat = proc_mkdir("stat", proc_net); -+ proc_mkdir("vz", 0); -+#ifdef CONFIG_SYSVIPC -+ proc_mkdir("sysvipc", 0); -+#endif -+ proc_root_fs = 
proc_mkdir("fs", 0); -+ /* XXX proc_tty_init(); */ -+ -+ /* XXX process inodes */ -+ -+ (void)set_exec_env(envid); -+ -+ (void)create_proc_glob_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); -+ return 0; -+} -+ -+static struct proc_dir_entry ve0_proc_root = { -+ .name = "/proc", -+ .namelen = 5, -+ .mode = S_IFDIR | S_IRUGO | S_IXUGO, -+ .nlink = 2 -+}; -+ -+void prepare_ve0_proc_root(void) -+{ -+ ve0.proc_root = &ve0_proc_root; -+} -+#endif -+ -+/* -+ * ------------------------------------------------------------------------ -+ * Virtualized sysctl -+ * ------------------------------------------------------------------------ -+ */ -+ -+static int semmin[4] = { 1, 1, 1, 1 }; -+static int semmax[4] = { 8000, INT_MAX, 1000, IPCMNI }; -+static ctl_table kern_table[] = { -+ {KERN_NODENAME, "hostname", system_utsname.nodename, 64, -+ 0644, NULL, &proc_doutsstring, &sysctl_string}, -+ {KERN_DOMAINNAME, "domainname", system_utsname.domainname, 64, -+ 0644, NULL, &proc_doutsstring, &sysctl_string}, -+#ifdef CONFIG_SYSVIPC -+#define get_ve0_field(fname) &ve0._##fname -+ {KERN_SHMMAX, "shmmax", get_ve0_field(shm_ctlmax), sizeof (size_t), -+ 0644, NULL, &proc_doulongvec_minmax }, -+ {KERN_SHMALL, "shmall", get_ve0_field(shm_ctlall), sizeof (size_t), -+ 0644, NULL, &proc_doulongvec_minmax }, -+ {KERN_SHMMNI, "shmmni", get_ve0_field(shm_ctlmni), sizeof (int), -+ 0644, NULL, &proc_dointvec_minmax, NULL, -+ NULL, &semmin[0], &semmax[3] }, -+ {KERN_MSGMAX, "msgmax", get_ve0_field(msg_ctlmax), sizeof (int), -+ 0644, NULL, &proc_dointvec }, -+ {KERN_MSGMNI, "msgmni", get_ve0_field(msg_ctlmni), sizeof (int), -+ 0644, NULL, &proc_dointvec_minmax, NULL, -+ NULL, &semmin[0], &semmax[3] }, -+ {KERN_MSGMNB, "msgmnb", get_ve0_field(msg_ctlmnb), sizeof (int), -+ 0644, NULL, &proc_dointvec }, -+ {KERN_SEM, "sem", get_ve0_field(sem_ctls), 4*sizeof (int), -+ 0644, NULL, &proc_dointvec }, -+#endif -+ {0} -+}; -+static ctl_table root_table[] = { -+ {CTL_KERN, "kernel", NULL, 0, 0555, kern_table}, 
-+ {0} -+}; -+extern int ip_rt_src_check; -+extern int ve_area_access_check; -+static ctl_table vz_ipv4_route_table[] = { -+ { -+ ctl_name: NET_IPV4_ROUTE_SRC_CHECK, -+ procname: "src_check", -+ data: &ip_rt_src_check, -+ maxlen: sizeof(int), -+ mode: 0644, -+ proc_handler: &proc_dointvec, -+ }, -+ { 0 } -+}; -+static ctl_table vz_ipv4_table[] = { -+ {NET_IPV4_ROUTE, "route", NULL, 0, 0555, vz_ipv4_route_table}, -+ { 0 } -+}; -+static ctl_table vz_net_table[] = { -+ {NET_IPV4, "ipv4", NULL, 0, 0555, vz_ipv4_table}, -+ { 0 } -+}; -+static ctl_table vz_fs_table[] = { -+ { -+ ctl_name: 226, -+ procname: "ve-area-access-check", -+ data: &ve_area_access_check, -+ maxlen: sizeof(int), -+ mode: 0644, -+ proc_handler: &proc_dointvec, -+ }, -+ { 0 } -+}; -+static ctl_table root_table2[] = { -+ {CTL_NET, "net", NULL, 0, 0555, vz_net_table}, -+ {CTL_FS, "fs", NULL, 0, 0555, vz_fs_table}, -+ { 0 } -+}; -+int prepare_sysctl(void) -+{ -+ struct ve_struct *envid; -+ -+ envid = set_exec_env(&ve0); -+ ve0.kern_header = register_sysctl_table(root_table, 1); -+ register_sysctl_table(root_table2, 0); -+ (void)set_exec_env(envid); -+ return 0; -+} -+ -+void prepare_ve0_sysctl(void) -+{ -+ INIT_LIST_HEAD(&ve0.sysctl_lh); -+#ifdef CONFIG_SYSCTL -+ ve0.proc_sys_root = proc_mkdir("sys", 0); -+#endif -+} -+ -+/* -+ * ------------------------------------------------------------------------ -+ * XXX init_ve_system -+ * ------------------------------------------------------------------------ -+ */ -+ -+void init_ve_system(void) -+{ -+ struct task_struct *init_entry, *p, *tsk; -+ struct ve_struct *ptr; -+ unsigned long flags; -+ int i; -+ -+ ptr = get_ve0(); -+ (void)get_ve(ptr); -+ atomic_set(&ptr->pcounter, 1); -+ -+ /* Don't forget about idle tasks */ -+ write_lock_irqsave(&tasklist_lock, flags); -+ for (i = 0; i < NR_CPUS; i++) { -+ tsk = idle_task(i); -+ if (tsk == NULL) -+ continue; -+ -+ prepare_ve0_process(tsk); -+ } -+ do_each_thread_all(p, tsk) { -+ prepare_ve0_process(tsk); -+ } 
while_each_thread_all(p, tsk); -+ write_unlock_irqrestore(&tasklist_lock, flags); -+ -+ init_entry = child_reaper; -+ ptr->init_entry = init_entry; -+ /* XXX: why? */ -+ cap_set_full(ptr->cap_default); -+ -+ ptr->_ipv4_devconf = &ipv4_devconf; -+ ptr->_ipv4_devconf_dflt = &ipv4_devconf_dflt; -+ -+ read_lock(&init_entry->fs->lock); -+ ptr->fs_rootmnt = init_entry->fs->rootmnt; -+ ptr->fs_root = init_entry->fs->root; -+ read_unlock(&init_entry->fs->lock); -+ -+ /* common prepares */ -+#ifdef CONFIG_PROC_FS -+ prepare_proc(); -+#endif -+ prepare_sysctl(); -+ prepare_ipc(); -+} -diff -upr linux-2.6.16.orig/kernel/vzdev.c linux-2.6.16-026test009/kernel/vzdev.c ---- linux-2.6.16.orig/kernel/vzdev.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/vzdev.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,126 @@ -+/* -+ * kernel/vzdev.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/fs.h> -+#include <linux/list.h> -+#include <linux/init.h> -+#include <linux/module.h> -+#include <linux/vzctl.h> -+#include <linux/slab.h> -+#include <linux/vmalloc.h> -+#include <linux/vzcalluser.h> -+#include <asm/uaccess.h> -+#include <asm/pgalloc.h> -+#include <linux/device.h> -+ -+#define VZCTL_MAJOR 126 -+#define VZCTL_NAME "vzctl" -+ -+MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); -+MODULE_DESCRIPTION("Virtuozzo Interface"); -+MODULE_LICENSE("GPL v2"); -+ -+static LIST_HEAD(ioctls); -+static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED; -+ -+int vzctl_ioctl(struct inode *ino, struct file *file, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err; -+ struct list_head *p; -+ struct vzioctlinfo *inf; -+ -+ err = -ENOTTY; -+ spin_lock(&ioctl_lock); -+ list_for_each(p, &ioctls) { -+ inf = list_entry(p, struct vzioctlinfo, list); -+ if (inf->type != _IOC_TYPE(cmd)) -+ continue; -+ -+ err = try_module_get(inf->owner) ? 
0 : -EBUSY; -+ spin_unlock(&ioctl_lock); -+ if (!err) { -+ err = (*inf->func)(ino, file, cmd, arg); -+ module_put(inf->owner); -+ } -+ return err; -+ } -+ spin_unlock(&ioctl_lock); -+ return err; -+} -+ -+void vzioctl_register(struct vzioctlinfo *inf) -+{ -+ spin_lock(&ioctl_lock); -+ list_add(&inf->list, &ioctls); -+ spin_unlock(&ioctl_lock); -+} -+ -+void vzioctl_unregister(struct vzioctlinfo *inf) -+{ -+ spin_lock(&ioctl_lock); -+ list_del_init(&inf->list); -+ spin_unlock(&ioctl_lock); -+} -+ -+EXPORT_SYMBOL(vzioctl_register); -+EXPORT_SYMBOL(vzioctl_unregister); -+ -+/* -+ * Init/exit stuff. -+ */ -+static struct file_operations vzctl_fops = { -+ .owner = THIS_MODULE, -+ .ioctl = vzctl_ioctl, -+}; -+ -+static struct class *vzctl_class; -+ -+static void __exit vzctl_exit(void) -+{ -+ class_device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0)); -+ class_destroy(vzctl_class); -+ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); -+} -+ -+static int __init vzctl_init(void) -+{ -+ int ret; -+ struct class_device *class_err; -+ -+ ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops); -+ if (ret < 0) -+ goto out; -+ -+ vzctl_class = class_create(THIS_MODULE, "vzctl"); -+ if (IS_ERR(vzctl_class)) { -+ ret = PTR_ERR(vzctl_class); -+ goto out_cleandev; -+ } -+ -+ class_err = class_device_create(vzctl_class, NULL, MKDEV(VZCTL_MAJOR, 0), -+ NULL, VZCTL_NAME); -+ if (IS_ERR(class_err)) { -+ ret = PTR_ERR(class_err); -+ goto out_rmclass; -+ } -+ -+ goto out; -+ -+out_rmclass: -+ class_destroy(vzctl_class); -+out_cleandev: -+ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); -+out: -+ return ret; -+} -+ -+module_init(vzctl_init) -+module_exit(vzctl_exit); -diff -upr linux-2.6.16.orig/kernel/vzwdog.c linux-2.6.16-026test009/kernel/vzwdog.c ---- linux-2.6.16.orig/kernel/vzwdog.c 2006-04-19 15:02:49.000000000 +0400 -+++ linux-2.6.16-026test009/kernel/vzwdog.c 2006-04-19 15:02:12.000000000 +0400 -@@ -0,0 +1,278 @@ -+/* -+ * kernel/vzwdog.c -+ * -+ * Copyright (C) 2000-2005 SWsoft -+ * All 
rights reserved. -+ * -+ * Licensing governed by "linux/COPYING.SWsoft" file. -+ * -+ */ -+ -+#include <linux/sched.h> -+#include <linux/fs.h> -+#include <linux/list.h> -+#include <linux/ctype.h> -+#include <linux/kobject.h> -+#include <linux/genhd.h> -+#include <linux/module.h> -+#include <linux/init.h> -+#include <linux/kernel.h> -+#include <linux/kernel_stat.h> -+#include <linux/smp_lock.h> -+#include <linux/errno.h> -+#include <linux/suspend.h> -+#include <linux/ve.h> -+#include <linux/vzstat.h> -+ -+/* Staff regading kernel thread polling VE validity */ -+static int sleep_timeout = 60; -+static pid_t wdog_thread_pid; -+static int wdog_thread_continue = 1; -+static DECLARE_COMPLETION(license_thread_exited); -+ -+extern void show_mem(void); -+extern struct ve_struct *ve_list_head; -+ -+#if 0 -+static char page[PAGE_SIZE]; -+ -+static void parse_irq_list(int len) -+{ -+ int i, k, skip; -+ for (i = 0; i < len; ) { -+ k = i; -+ while (i < len && page[i] != '\n' && page[i] != ':') -+ i++; -+ skip = 0; -+ if (i < len && page[i] != '\n') { -+ i++; /* skip ':' */ -+ while (i < len && (page[i] == ' ' || page[i] == '0')) -+ i++; -+ skip = (i < len && (page[i] < '0' || page[i] > '9')); -+ while (i < len && page[i] != '\n') -+ i++; -+ } -+ if (!skip) -+ printk("\n%.*s", i - k, page + k); -+ if (i < len) -+ i++; /* skip '\n' */ -+ } -+} -+#endif -+ -+static void show_irq_list(void) -+{ -+#if 0 -+ i = KSYMSAFECALL(int, get_irq_list, (page)); -+ parse_irq_list(i); /* Safe, zero was returned if unassigned */ -+#endif -+} -+ -+static void show_alloc_latency(void) -+{ -+ static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = { -+ "A0", -+ "L0", -+ "H0", -+ "L1", -+ "H1" -+ }; -+ int i; -+ -+ printk("lat: "); -+ for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) { -+ struct kstat_lat_struct *p; -+ cycles_t maxlat, avg0, avg1, avg2; -+ -+ p = &kstat_glob.alloc_lat[i]; -+ spin_lock_irq(&kstat_glb_lock); -+ maxlat = p->last.maxlat; -+ avg0 = p->avg[0]; -+ avg1 = p->avg[1]; -+ avg2 = p->avg[2]; 
-+ spin_unlock_irq(&kstat_glb_lock); -+ -+ printk("%s %Lu (%Lu %Lu %Lu)", -+ alloc_descr[i], -+ maxlat, -+ avg0, -+ avg1, -+ avg2); -+ } -+ printk("\n"); -+} -+ -+static void show_schedule_latency(void) -+{ -+ struct kstat_lat_pcpu_struct *p; -+ cycles_t maxlat, totlat, avg0, avg1, avg2; -+ unsigned long count; -+ -+ p = &kstat_glob.sched_lat; -+ spin_lock_irq(&kstat_glb_lock); -+ maxlat = p->last.maxlat; -+ totlat = p->last.totlat; -+ count = p->last.count; -+ avg0 = p->avg[0]; -+ avg1 = p->avg[1]; -+ avg2 = p->avg[2]; -+ spin_unlock_irq(&kstat_glb_lock); -+ -+ printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n", -+ maxlat, -+ totlat, -+ count, -+ avg0, -+ avg1, -+ avg2); -+} -+ -+static void show_header(void) -+{ -+ struct timeval tv; -+ -+ do_gettimeofday(&tv); -+ printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n", -+ tv.tv_sec, tv.tv_usec, -+ get_jiffies_64(), smp_processor_id()); -+#ifdef CONFIG_FAIRSCHED -+ printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n", -+ cycles_per_jiffy, HZ); -+#else -+ printk("*** jiffies_per_second %u ***\n", HZ); -+#endif -+} -+ -+static void show_pgdatinfo(void) -+{ -+ pg_data_t *pgdat; -+ -+ printk("pgdat:"); -+ for_each_pgdat(pgdat) { -+ printk(" %d: %lu,%lu,%lu,%p", -+ pgdat->node_id, -+ pgdat->node_start_pfn, -+ pgdat->node_present_pages, -+ pgdat->node_spanned_pages, -+ pgdat->node_mem_map); -+ } -+ printk("\n"); -+} -+ -+static void show_diskio(void) -+{ -+ struct gendisk *gd; -+ char buf[BDEVNAME_SIZE]; -+ -+ printk("disk_io: "); -+ -+ down_read(&block_subsys.rwsem); -+ list_for_each_entry(gd, &block_subsys.kset.list, kobj.entry) { -+ char *name; -+ name = disk_name(gd, 0, buf); -+ if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) && -+ isdigit(name[4])) -+ continue; -+ if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) && -+ isdigit(name[3])) -+ continue; -+ printk("(%u,%u) %s r(%u %u %u) w(%u %u %u)\n", -+ gd->major, gd->first_minor, -+ name, -+ disk_stat_read(gd, ios[READ]), -+ 
disk_stat_read(gd, sectors[READ]), -+ disk_stat_read(gd, merges[READ]), -+ disk_stat_read(gd, ios[WRITE]), -+ disk_stat_read(gd, sectors[WRITE]), -+ disk_stat_read(gd, merges[WRITE])); -+ } -+ up_read(&block_subsys.rwsem); -+ -+ printk("\n"); -+} -+ -+static void show_nrprocs(void) -+{ -+ unsigned long _nr_running, _nr_sleeping, -+ _nr_unint, _nr_zombie, _nr_dead, _nr_stopped; -+ -+ _nr_running = nr_running(); -+ _nr_unint = nr_uninterruptible(); -+ _nr_sleeping = nr_sleeping(); -+ _nr_zombie = nr_zombie; -+ _nr_dead = atomic_read(&nr_dead); -+ _nr_stopped = nr_stopped(); -+ -+ printk("VEnum: %d, proc R %lu, S %lu, D %lu, " -+ "Z %lu, X %lu, T %lu (tot %d)\n", -+ nr_ve, _nr_running, _nr_sleeping, _nr_unint, -+ _nr_zombie, _nr_dead, _nr_stopped, nr_threads); -+} -+ -+static void wdog_print(void) -+{ -+ show_header(); -+ show_irq_list(); -+ show_pgdatinfo(); -+ show_mem(); -+ show_diskio(); -+ show_schedule_latency(); -+ show_alloc_latency(); -+ show_nrprocs(); -+} -+ -+static int wdog_loop(void* data) -+{ -+ struct task_struct *tsk = current; -+ DECLARE_WAIT_QUEUE_HEAD(thread_wait_queue); -+ -+ /* -+ * This thread doesn't need any user-level access, -+ * so get rid of all our resources -+ */ -+ daemonize("wdogd"); -+ -+ spin_lock_irq(&tsk->sighand->siglock); -+ sigfillset(&tsk->blocked); -+ sigdelset(&tsk->blocked, SIGHUP); -+ recalc_sigpending(); -+ spin_unlock_irq(&tsk->sighand->siglock); -+ -+ while (wdog_thread_continue) { -+ wdog_print(); -+ interruptible_sleep_on_timeout(&thread_wait_queue, -+ sleep_timeout*HZ); -+ try_to_freeze(); -+ /* clear all signals */ -+ if (signal_pending(tsk)) -+ flush_signals(tsk); -+ } -+ -+ complete_and_exit(&license_thread_exited, 0); -+} -+ -+static int __init wdog_init(void) -+{ -+ wdog_thread_pid = kernel_thread(wdog_loop, NULL, 0); -+ if (wdog_thread_pid < 0) -+ return wdog_thread_pid; -+ -+ return 0; -+} -+ -+static void __exit wdog_exit(void) -+{ -+ wdog_thread_continue = 0; -+ if (wdog_thread_pid > 0) { -+ 
kill_proc(wdog_thread_pid, SIGHUP, 1); -+ wait_for_completion(&license_thread_exited); -+ } -+} -+ -+module_param(sleep_timeout, int, 0); -+MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); -+MODULE_DESCRIPTION("Virtuozzo WDOG"); -+MODULE_LICENSE("GPL v2"); -+ -+module_init(wdog_init) -+module_exit(wdog_exit) -diff -upr linux-2.6.16.orig/lib/bust_spinlocks.c linux-2.6.16-026test009/lib/bust_spinlocks.c ---- linux-2.6.16.orig/lib/bust_spinlocks.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/lib/bust_spinlocks.c 2006-04-19 15:02:11.000000000 +0400 -@@ -20,19 +20,11 @@ void bust_spinlocks(int yes) - if (yes) { - oops_in_progress = 1; - } else { -- int loglevel_save = console_loglevel; - #ifdef CONFIG_VT - unblank_screen(); - #endif - oops_in_progress = 0; -- /* -- * OK, the message is on the console. Now we call printk() -- * without oops_in_progress set so that printk() will give klogd -- * and the blanked console a poke. Hold onto your hats... -- */ -- console_loglevel = 15; /* NMI oopser may have shut the console up */ -- printk(" "); -- console_loglevel = loglevel_save; -+ wake_up_klogd(); - } - } - -diff -upr linux-2.6.16.orig/mm/filemap_xip.c linux-2.6.16-026test009/mm/filemap_xip.c ---- linux-2.6.16.orig/mm/filemap_xip.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/filemap_xip.c 2006-04-19 15:02:12.000000000 +0400 -@@ -190,7 +190,10 @@ __xip_unmap (struct address_space * mapp - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); - page_remove_rmap(page); -+ pb_remove_ref(page, mm); -+ ub_unused_privvm_inc(mm, vma); - dec_mm_counter(mm, file_rss); -+ dec_vma_rss(vma); - BUG_ON(pte_dirty(pteval)); - pte_unmap_unlock(pte, ptl); - page_cache_release(page); -diff -upr linux-2.6.16.orig/mm/fremap.c linux-2.6.16-026test009/mm/fremap.c ---- linux-2.6.16.orig/mm/fremap.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/fremap.c 2006-04-19 15:02:12.000000000 +0400 -@@ -20,6 
+20,8 @@ - #include <asm/cacheflush.h> - #include <asm/tlbflush.h> - -+#include <ub/ub_vmpages.h> -+ - static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) - { -@@ -34,6 +36,7 @@ static int zap_pte(struct mm_struct *mm, - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); -+ pb_remove_ref(page, mm); - page_cache_release(page); - } - } else { -@@ -57,6 +60,10 @@ int install_page(struct mm_struct *mm, s - pte_t *pte; - pte_t pte_val; - spinlock_t *ptl; -+ struct page_beancounter *pbc; -+ -+ if (unlikely(pb_alloc(&pbc))) -+ goto out_nopb; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) -@@ -75,11 +82,15 @@ int install_page(struct mm_struct *mm, s - if (page_mapcount(page) > INT_MAX/2) - goto unlock; - -- if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) -+ if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) { -+ ub_unused_privvm_dec(mm, vma); - inc_mm_counter(mm, file_rss); -+ inc_vma_rss(vma); -+ } - - flush_icache_page(vma, page); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); -+ pb_add_ref(page, mm, &pbc); - page_add_file_rmap(page); - pte_val = *pte; - update_mmu_cache(vma, addr, pte_val); -@@ -87,6 +98,8 @@ int install_page(struct mm_struct *mm, s - unlock: - pte_unmap_unlock(pte, ptl); - out: -+ pb_free(&pbc); -+out_nopb: - return err; - } - EXPORT_SYMBOL(install_page); -@@ -109,7 +122,9 @@ int install_file_pte(struct mm_struct *m - - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { - update_hiwater_rss(mm); -+ ub_unused_privvm_inc(mm, vma); - dec_mm_counter(mm, file_rss); -+ dec_vma_rss(vma); - } - - set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); -@@ -220,4 +235,5 @@ asmlinkage long sys_remap_file_pages(uns - - return err; - } -+EXPORT_SYMBOL_GPL(sys_remap_file_pages); - -diff -upr linux-2.6.16.orig/mm/madvise.c linux-2.6.16-026test009/mm/madvise.c ---- linux-2.6.16.orig/mm/madvise.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/madvise.c 2006-04-19 
15:02:11.000000000 +0400 -@@ -168,6 +168,9 @@ static long madvise_remove(struct vm_are - return -EINVAL; - } - -+ if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) -+ return -EACCES; -+ - mapping = vma->vm_file->f_mapping; - - offset = (loff_t)(start - vma->vm_start) -diff -upr linux-2.6.16.orig/mm/memory.c linux-2.6.16-026test009/mm/memory.c ---- linux-2.6.16.orig/mm/memory.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/memory.c 2006-04-19 15:02:12.000000000 +0400 -@@ -58,6 +58,8 @@ - #include <linux/swapops.h> - #include <linux/elf.h> - -+#include <ub/ub_vmpages.h> -+ - #ifndef CONFIG_NEED_MULTIPLE_NODES - /* use the per-pgdat data instead for discontigmem - mbligh */ - unsigned long max_mapnr; -@@ -81,6 +83,7 @@ unsigned long vmalloc_earlyreserve; - EXPORT_SYMBOL(num_physpages); - EXPORT_SYMBOL(high_memory); - EXPORT_SYMBOL(vmalloc_earlyreserve); -+EXPORT_SYMBOL_GPL(empty_zero_page); - - int randomize_va_space __read_mostly = 1; - -@@ -103,18 +106,21 @@ void pgd_clear_bad(pgd_t *pgd) - pgd_ERROR(*pgd); - pgd_clear(pgd); - } -+EXPORT_SYMBOL_GPL(pgd_clear_bad); - - void pud_clear_bad(pud_t *pud) - { - pud_ERROR(*pud); - pud_clear(pud); - } -+EXPORT_SYMBOL_GPL(pud_clear_bad); - - void pmd_clear_bad(pmd_t *pmd) - { - pmd_ERROR(*pmd); - pmd_clear(pmd); - } -+EXPORT_SYMBOL_GPL(pmd_clear_bad); - - /* - * Note: this doesn't free the actual pages themselves. That -@@ -318,6 +324,7 @@ int __pte_alloc(struct mm_struct *mm, pm - spin_unlock(&mm->page_table_lock); - return 0; - } -+EXPORT_SYMBOL_GPL(__pte_alloc); - - int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) - { -@@ -418,6 +425,7 @@ struct page *vm_normal_page(struct vm_ar - */ - return pfn_to_page(pfn); - } -+EXPORT_SYMBOL_GPL(vm_normal_page); - - /* - * copy one vm_area from one task to the other. 
Assumes the page tables -@@ -428,7 +436,7 @@ struct page *vm_normal_page(struct vm_ar - static inline void - copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, -- unsigned long addr, int *rss) -+ unsigned long addr, int *rss, struct page_beancounter **pbc) - { - unsigned long vm_flags = vma->vm_flags; - pte_t pte = *src_pte; -@@ -471,6 +479,7 @@ copy_one_pte(struct mm_struct *dst_mm, s - if (page) { - get_page(page); - page_dup_rmap(page); -+ pb_dup_ref(page, dst_mm, pbc); - rss[!!PageAnon(page)]++; - } - -@@ -478,20 +487,36 @@ out_set_pte: - set_pte_at(dst_mm, addr, dst_pte, pte); - } - -+#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1))) -+#ifdef CONFIG_USER_RESOURCE -+#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub) -+#else -+#define same_ub(mm1, mm2) (1) -+#endif -+ - static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, -- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, -+ pmd_t *dst_pmd, pmd_t *src_pmd, -+ struct vm_area_struct *dst_vma, -+ struct vm_area_struct *vma, - unsigned long addr, unsigned long end) - { - pte_t *src_pte, *dst_pte; - spinlock_t *src_ptl, *dst_ptl; - int progress = 0; -- int rss[2]; -+ int rss[2], rss_tot; -+ struct page_beancounter *pbc; -+ int err; - -+ err = -ENOMEM; -+ pbc = same_ub(src_mm, dst_mm) ? 
PBC_COPY_SAME : NULL; - again: -+ if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr))) -+ goto out; - rss[1] = rss[0] = 0; - dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) -- return -ENOMEM; -+ goto out; -+ - src_pte = pte_offset_map_nested(src_pmd, addr); - src_ptl = pte_lockptr(src_mm, src_pmd); - spin_lock(src_ptl); -@@ -512,22 +537,32 @@ again: - progress++; - continue; - } -- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); -+ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, -+ vma, addr, rss, &pbc); - progress += 8; - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); - - spin_unlock(src_ptl); - pte_unmap_nested(src_pte - 1); -+ rss_tot = rss[0] + rss[1]; -+ add_vma_rss(dst_vma, rss_tot); -+ ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot); - add_mm_rss(dst_mm, rss[0], rss[1]); - pte_unmap_unlock(dst_pte - 1, dst_ptl); - cond_resched(); - if (addr != end) - goto again; -- return 0; -+ -+ err = 0; -+out: -+ pb_free_list(&pbc); -+ return err; - } - - static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, -- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, -+ pud_t *dst_pud, pud_t *src_pud, -+ struct vm_area_struct *dst_vma, -+ struct vm_area_struct *vma, - unsigned long addr, unsigned long end) - { - pmd_t *src_pmd, *dst_pmd; -@@ -542,14 +577,16 @@ static inline int copy_pmd_range(struct - if (pmd_none_or_clear_bad(src_pmd)) - continue; - if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, -- vma, addr, next)) -+ dst_vma, vma, addr, next)) - return -ENOMEM; - } while (dst_pmd++, src_pmd++, addr = next, addr != end); - return 0; - } - - static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, -- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, -+ pgd_t *dst_pgd, pgd_t *src_pgd, -+ struct vm_area_struct *dst_vma, -+ struct vm_area_struct *vma, - unsigned long addr, unsigned long end) - { - pud_t *src_pud, *dst_pud; 
-@@ -564,19 +601,20 @@ static inline int copy_pud_range(struct - if (pud_none_or_clear_bad(src_pud)) - continue; - if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, -- vma, addr, next)) -+ dst_vma, vma, addr, next)) - return -ENOMEM; - } while (dst_pud++, src_pud++, addr = next, addr != end); - return 0; - } - --int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, -- struct vm_area_struct *vma) -+int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma, -+ unsigned long addr, size_t size) - { -+ struct mm_struct *dst_mm = dst_vma->vm_mm; -+ struct mm_struct *src_mm = vma->vm_mm; - pgd_t *src_pgd, *dst_pgd; - unsigned long next; -- unsigned long addr = vma->vm_start; -- unsigned long end = vma->vm_end; -+ unsigned long end = addr + size; - - /* - * Don't copy ptes where a page fault will fill them correctly. -@@ -599,11 +637,22 @@ int copy_page_range(struct mm_struct *ds - if (pgd_none_or_clear_bad(src_pgd)) - continue; - if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, -- vma, addr, next)) -+ dst_vma, vma, addr, next)) - return -ENOMEM; - } while (dst_pgd++, src_pgd++, addr = next, addr != end); - return 0; - } -+EXPORT_SYMBOL_GPL(__copy_page_range); -+ -+int copy_page_range(struct mm_struct *dst, struct mm_struct *src, -+ struct vm_area_struct *dst_vma, struct vm_area_struct *vma) -+{ -+ if (dst_vma->vm_mm != dst) -+ BUG(); -+ if (vma->vm_mm != src) -+ BUG(); -+ return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start); -+} - - static unsigned long zap_pte_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pmd_t *pmd, -@@ -615,6 +664,7 @@ static unsigned long zap_pte_range(struc - spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; -+ int rss; - - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - do { -@@ -668,6 +718,7 @@ static unsigned long zap_pte_range(struc - file_rss--; - } - page_remove_rmap(page); -+ pb_remove_ref(page, mm); - tlb_remove_page(tlb, page); - continue; - } 
-@@ -682,6 +733,9 @@ static unsigned long zap_pte_range(struc - pte_clear_full(mm, addr, pte, tlb->fullmm); - } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - -+ rss = -(file_rss + anon_rss); -+ ub_unused_privvm_add(mm, vma, rss); -+ sub_vma_rss(vma, rss); - add_mm_rss(mm, file_rss, anon_rss); - pte_unmap_unlock(pte - 1, ptl); - -@@ -1087,12 +1141,14 @@ int get_user_pages(struct task_struct *t - } - EXPORT_SYMBOL(get_user_pages); - --static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, -+static int zeromap_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t prot) - { - pte_t *pte; - spinlock_t *ptl; -+ struct mm_struct *mm; - -+ mm = vma->vm_mm; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (!pte) - return -ENOMEM; -@@ -1102,6 +1158,7 @@ static int zeromap_pte_range(struct mm_s - page_cache_get(page); - page_add_file_rmap(page); - inc_mm_counter(mm, file_rss); -+ inc_vma_rss(vma); - BUG_ON(!pte_none(*pte)); - set_pte_at(mm, addr, pte, zero_pte); - } while (pte++, addr += PAGE_SIZE, addr != end); -@@ -1109,35 +1166,35 @@ static int zeromap_pte_range(struct mm_s - return 0; - } - --static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, -+static inline int zeromap_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t prot) - { - pmd_t *pmd; - unsigned long next; - -- pmd = pmd_alloc(mm, pud, addr); -+ pmd = pmd_alloc(vma->vm_mm, pud, addr); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); -- if (zeromap_pte_range(mm, pmd, addr, next, prot)) -+ if (zeromap_pte_range(vma, pmd, addr, next, prot)) - return -ENOMEM; - } while (pmd++, addr = next, addr != end); - return 0; - } - --static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, -+static inline int zeromap_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t prot) - { - pud_t *pud; - 
unsigned long next; - -- pud = pud_alloc(mm, pgd, addr); -+ pud = pud_alloc(vma->vm_mm, pgd, addr); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); -- if (zeromap_pmd_range(mm, pud, addr, next, prot)) -+ if (zeromap_pmd_range(vma, pud, addr, next, prot)) - return -ENOMEM; - } while (pud++, addr = next, addr != end); - return 0; -@@ -1149,15 +1206,14 @@ int zeromap_page_range(struct vm_area_st - pgd_t *pgd; - unsigned long next; - unsigned long end = addr + size; -- struct mm_struct *mm = vma->vm_mm; - int err; - - BUG_ON(addr >= end); -- pgd = pgd_offset(mm, addr); -+ pgd = pgd_offset(vma->vm_mm, addr); - flush_cache_range(vma, addr, end); - do { - next = pgd_addr_end(addr, end); -- err = zeromap_pud_range(mm, pgd, addr, next, prot); -+ err = zeromap_pud_range(vma, pgd, addr, next, prot); - if (err) - break; - } while (pgd++, addr = next, addr != end); -@@ -1183,11 +1239,14 @@ pte_t * fastcall get_locked_pte(struct m - * old drivers should use this, and they needed to mark their - * pages reserved for the old functions anyway. - */ --static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) -+static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) - { - int retval; - pte_t *pte; -- spinlock_t *ptl; -+ spinlock_t *ptl; -+ struct mm_struct *mm; -+ -+ mm = vma->vm_mm; - - retval = -EINVAL; - if (PageAnon(page)) -@@ -1204,6 +1263,7 @@ static int insert_page(struct mm_struct - /* Ok, finally just insert the thing.. 
*/ - get_page(page); - inc_mm_counter(mm, file_rss); -+ inc_vma_rss(vma); - page_add_file_rmap(page); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); - -@@ -1240,7 +1300,7 @@ int vm_insert_page(struct vm_area_struct - if (!page_count(page)) - return -EINVAL; - vma->vm_flags |= VM_INSERTPAGE; -- return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); -+ return insert_page(vma, addr, page, vma->vm_page_prot); - } - EXPORT_SYMBOL(vm_insert_page); - -@@ -1449,6 +1509,7 @@ static int do_wp_page(struct mm_struct * - struct page *old_page, *new_page; - pte_t entry; - int ret = VM_FAULT_MINOR; -+ struct page_beancounter *pbc; - - old_page = vm_normal_page(vma, address, orig_pte); - if (!old_page) -@@ -1476,6 +1537,9 @@ static int do_wp_page(struct mm_struct * - gotten: - pte_unmap_unlock(page_table, ptl); - -+ if (unlikely(pb_alloc(&pbc))) -+ goto oom_nopb; -+ - if (unlikely(anon_vma_prepare(vma))) - goto oom; - if (old_page == ZERO_PAGE(address)) { -@@ -1496,12 +1560,16 @@ gotten: - if (likely(pte_same(*page_table, orig_pte))) { - if (old_page) { - page_remove_rmap(old_page); -+ pb_remove_ref(old_page, mm); - if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); - } -- } else -+ } else { -+ ub_unused_privvm_dec(mm, vma); - inc_mm_counter(mm, anon_rss); -+ inc_vma_rss(vma); -+ } - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = mk_pte(new_page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); -@@ -1510,6 +1578,7 @@ gotten: - lazy_mmu_prot_update(entry); - lru_cache_add_active(new_page); - page_add_new_anon_rmap(new_page, vma, address); -+ pb_add_ref(new_page, mm, &pbc); - - /* Free the old page.. 
*/ - new_page = old_page; -@@ -1519,10 +1588,13 @@ gotten: - page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); -+ pb_free(&pbc); - unlock: - pte_unmap_unlock(page_table, ptl); - return ret; - oom: -+ pb_free(&pbc); -+oom_nopb: - if (old_page) - page_cache_release(old_page); - return VM_FAULT_OOM; -@@ -1877,10 +1949,16 @@ static int do_swap_page(struct mm_struct - swp_entry_t entry; - pte_t pte; - int ret = VM_FAULT_MINOR; -+ struct page_beancounter *pbc; -+ cycles_t start; - - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) -- goto out; -+ goto out_nostat; -+ -+ if (unlikely(pb_alloc(&pbc))) -+ return VM_FAULT_OOM; - -+ start = get_cycles(); - entry = pte_to_swp_entry(orig_pte); - again: - page = lookup_swap_cache(entry); -@@ -1928,6 +2006,8 @@ again: - /* The page isn't present yet, go ahead with the fault. */ - - inc_mm_counter(mm, anon_rss); -+ inc_vma_rss(vma); -+ ub_swapin_inc(mm); - pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) { - pte = maybe_mkwrite(pte_mkdirty(pte), vma); -@@ -1937,6 +2017,8 @@ again: - flush_icache_page(vma, page); - set_pte_at(mm, address, page_table, pte); - page_add_anon_rmap(page, vma, address); -+ pb_add_ref(page, mm, &pbc); -+ ub_unused_privvm_dec(mm, vma); - - swap_free(entry); - if (vm_swap_full()) -@@ -1947,7 +2029,7 @@ again: - if (do_wp_page(mm, vma, address, - page_table, pmd, ptl, pte) == VM_FAULT_OOM) - ret = VM_FAULT_OOM; -- goto out; -+ goto out_wp; - } - - /* No need to invalidate - it was non-present before */ -@@ -1955,10 +2037,16 @@ again: - lazy_mmu_prot_update(pte); - unlock: - pte_unmap_unlock(page_table, ptl); --out: -+out_wp: -+ pb_free(&pbc); -+ spin_lock_irq(&kstat_glb_lock); -+ KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start); -+ spin_unlock_irq(&kstat_glb_lock); -+out_nostat: - return ret; - out_nomap: - pte_unmap_unlock(page_table, ptl); -+ pb_free(&pbc); - unlock_page(page); - page_cache_release(page); - return ret; -@@ 
-1976,11 +2064,15 @@ static int do_anonymous_page(struct mm_s - struct page *page; - spinlock_t *ptl; - pte_t entry; -+ struct page_beancounter *pbc; - - if (write_access) { - /* Allocate our own private page. */ - pte_unmap(page_table); - -+ if (unlikely(pb_alloc(&pbc))) -+ goto oom_nopb; -+ - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_zeroed_user_highpage(vma, address); -@@ -1996,7 +2088,10 @@ static int do_anonymous_page(struct mm_s - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - page_add_new_anon_rmap(page, vma, address); -+ pb_add_ref(page, mm, &pbc); - } else { -+ pbc = NULL; -+ - /* Map the ZERO_PAGE - vm_page_prot is readonly */ - page = ZERO_PAGE(address); - page_cache_get(page); -@@ -2010,18 +2105,23 @@ static int do_anonymous_page(struct mm_s - page_add_file_rmap(page); - } - -+ inc_vma_rss(vma); -+ ub_unused_privvm_dec(mm, vma); - set_pte_at(mm, address, page_table, entry); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); - unlock: -+ pb_free(&pbc); - pte_unmap_unlock(page_table, ptl); - return VM_FAULT_MINOR; - release: - page_cache_release(page); - goto unlock; - oom: -+ pb_free(&pbc); -+oom_nopb: - return VM_FAULT_OOM; - } - -@@ -2049,6 +2149,7 @@ static int do_no_page(struct mm_struct * - unsigned int sequence = 0; - int ret = VM_FAULT_MINOR; - int anon = 0; -+ struct page_beancounter *pbc; - - pte_unmap(page_table); - BUG_ON(vma->vm_flags & VM_PFNMAP); -@@ -2058,6 +2159,9 @@ static int do_no_page(struct mm_struct * - sequence = mapping->truncate_count; - smp_rmb(); /* serializes i_size against truncate_count */ - } -+ -+ if (unlikely(pb_alloc(&pbc))) -+ goto oom_nopb; - retry: - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); - /* -@@ -2070,9 +2174,9 @@ retry: - - /* no page was available -- either SIGBUS or OOM */ - if (new_page == NOPAGE_SIGBUS) -- return VM_FAULT_SIGBUS; -+ goto bus_nopg; - if (new_page == 
NOPAGE_OOM) -- return VM_FAULT_OOM; -+ goto oom_nopg; - - /* - * Should we do an early C-O-W break? -@@ -2131,6 +2235,9 @@ retry: - inc_mm_counter(mm, file_rss); - page_add_file_rmap(new_page); - } -+ inc_vma_rss(vma); -+ pb_add_ref(new_page, mm, &pbc); -+ ub_unused_privvm_dec(mm, vma); - } else { - /* One of our sibling threads was faster, back out. */ - page_cache_release(new_page); -@@ -2142,10 +2249,18 @@ retry: - lazy_mmu_prot_update(entry); - unlock: - pte_unmap_unlock(page_table, ptl); -+ pb_free(&pbc); - return ret; - oom: - page_cache_release(new_page); -+oom_nopg: -+ pb_free(&pbc); -+oom_nopb: - return VM_FAULT_OOM; -+ -+bus_nopg: -+ pb_free(&pbc); -+ return VM_FAULT_SIGBUS; - } - - /* -@@ -2314,6 +2429,8 @@ int __pud_alloc(struct mm_struct *mm, pg - } - #endif /* __PAGETABLE_PUD_FOLDED */ - -+EXPORT_SYMBOL_GPL(__pud_alloc); -+ - #ifndef __PAGETABLE_PMD_FOLDED - /* - * Allocate page middle directory. -@@ -2348,6 +2465,8 @@ int __pmd_alloc(struct mm_struct *mm, pu - } - #endif /* __PAGETABLE_PMD_FOLDED */ - -+EXPORT_SYMBOL_GPL(__pmd_alloc); -+ - int make_pages_present(unsigned long addr, unsigned long end) - { - int ret, len, write; -diff -upr linux-2.6.16.orig/mm/mempolicy.c linux-2.6.16-026test009/mm/mempolicy.c ---- linux-2.6.16.orig/mm/mempolicy.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/mempolicy.c 2006-04-19 15:02:12.000000000 +0400 -@@ -933,7 +933,7 @@ asmlinkage long sys_migrate_pages(pid_t - - /* Find the mm_struct */ - read_lock(&tasklist_lock); -- task = pid ? find_task_by_pid(pid) : current; -+ task = pid ? 
find_task_by_pid_ve(pid) : current; - if (!task) { - read_unlock(&tasklist_lock); - return -ESRCH; -diff -upr linux-2.6.16.orig/mm/mempool.c linux-2.6.16-026test009/mm/mempool.c ---- linux-2.6.16.orig/mm/mempool.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/mempool.c 2006-04-19 15:02:11.000000000 +0400 -@@ -14,6 +14,7 @@ - #include <linux/mempool.h> - #include <linux/blkdev.h> - #include <linux/writeback.h> -+#include <linux/kmem_cache.h> - - static void add_element(mempool_t *pool, void *element) - { -@@ -78,6 +79,8 @@ mempool_t *mempool_create_node(int min_n - init_waitqueue_head(&pool->wait); - pool->alloc = alloc_fn; - pool->free = free_fn; -+ if (alloc_fn == mempool_alloc_slab) -+ kmem_mark_nocharge((kmem_cache_t *)pool_data); - - /* - * First pre-allocate the guaranteed number of buffers. -@@ -119,6 +122,7 @@ int mempool_resize(mempool_t *pool, int - unsigned long flags; - - BUG_ON(new_min_nr <= 0); -+ gfp_mask &= ~__GFP_UBC; - - spin_lock_irqsave(&pool->lock, flags); - if (new_min_nr <= pool->min_nr) { -@@ -212,6 +216,7 @@ void * mempool_alloc(mempool_t *pool, gf - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ - gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ - gfp_mask |= __GFP_NOWARN; /* failures are OK */ -+ gfp_mask &= ~__GFP_UBC; - - gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); - -diff -upr linux-2.6.16.orig/mm/mlock.c linux-2.6.16-026test009/mm/mlock.c ---- linux-2.6.16.orig/mm/mlock.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/mlock.c 2006-04-19 15:02:12.000000000 +0400 -@@ -8,9 +8,11 @@ - #include <linux/capability.h> - #include <linux/mman.h> - #include <linux/mm.h> -+#include <linux/module.h> - #include <linux/mempolicy.h> - #include <linux/syscalls.h> - -+#include <ub/ub_vmpages.h> - - static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, unsigned int newflags) -@@ -25,6 +27,14 @@ static int 
mlock_fixup(struct vm_area_st - goto out; - } - -+ if (newflags & VM_LOCKED) { -+ ret = ub_locked_charge(mm, end - start); -+ if (ret < 0) { -+ *prev = vma; -+ goto out; -+ } -+ } -+ - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); -@@ -38,13 +48,13 @@ static int mlock_fixup(struct vm_area_st - if (start != vma->vm_start) { - ret = split_vma(mm, vma, start, 1); - if (ret) -- goto out; -+ goto out_uncharge; - } - - if (end != vma->vm_end) { - ret = split_vma(mm, vma, end, 0); - if (ret) -- goto out; -+ goto out_uncharge; - } - - success: -@@ -63,13 +73,19 @@ success: - pages = -pages; - if (!(newflags & VM_IO)) - ret = make_pages_present(start, end); -- } -+ } else -+ ub_locked_uncharge(mm, end - start); - - vma->vm_mm->locked_vm -= pages; - out: - if (ret == -ENOMEM) - ret = -EAGAIN; - return ret; -+ -+out_uncharge: -+ if (newflags & VM_LOCKED) -+ ub_locked_uncharge(mm, end - start); -+ goto out; - } - - static int do_mlock(unsigned long start, size_t len, int on) -@@ -146,6 +162,7 @@ asmlinkage long sys_mlock(unsigned long - up_write(¤t->mm->mmap_sem); - return error; - } -+EXPORT_SYMBOL_GPL(sys_mlock); - - asmlinkage long sys_munlock(unsigned long start, size_t len) - { -@@ -158,6 +175,7 @@ asmlinkage long sys_munlock(unsigned lon - up_write(¤t->mm->mmap_sem); - return ret; - } -+EXPORT_SYMBOL_GPL(sys_munlock); - - static int do_mlockall(int flags) - { -diff -upr linux-2.6.16.orig/mm/mmap.c linux-2.6.16-026test009/mm/mmap.c ---- linux-2.6.16.orig/mm/mmap.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/mmap.c 2006-04-19 15:02:12.000000000 +0400 -@@ -25,14 +25,18 @@ - #include <linux/mount.h> - #include <linux/mempolicy.h> - #include <linux/rmap.h> -+#include <linux/virtinfo.h> - - #include <asm/uaccess.h> - #include <asm/cacheflush.h> - #include <asm/tlb.h> - -+#include <ub/ub_vmpages.h> -+ - static void 
unmap_region(struct mm_struct *mm, - struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long start, unsigned long end); -+static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft); - - /* - * WARNING: the debugging will use recursive algorithms so never enable this -@@ -87,6 +91,16 @@ int __vm_enough_memory(long pages, int c - - vm_acct_memory(pages); - -+ switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM, -+ (void *)pages) -+ & (NOTIFY_OK | NOTIFY_FAIL)) { -+ case NOTIFY_OK: -+ return 0; -+ case NOTIFY_FAIL: -+ vm_unacct_memory(pages); -+ return -ENOMEM; -+ } -+ - /* - * Sometimes we want to use more memory than we have - */ -@@ -201,11 +215,16 @@ static struct vm_area_struct *remove_vma - struct vm_area_struct *next = vma->vm_next; - - might_sleep(); -+ -+ ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start, -+ vma->vm_flags, vma->vm_file); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - if (vma->vm_file) - fput(vma->vm_file); - mpol_free(vma_policy(vma)); -+ if (get_vma_rss(vma)) -+ warn_bad_rss(vma, 0); - kmem_cache_free(vm_area_cachep, vma); - return next; - } -@@ -242,7 +261,7 @@ asmlinkage unsigned long sys_brk(unsigne - goto out; - - /* Ok, looks good - let it rip. 
*/ -- if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) -+ if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk) - goto out; - set_brk: - mm->brk = brk; -@@ -726,7 +745,7 @@ struct vm_area_struct *vma_merge(struct - else - next = mm->mmap; - area = next; -- if (next && next->vm_end == end) /* cases 6, 7, 8 */ -+ if (next && next->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; - - /* -@@ -746,11 +765,22 @@ struct vm_area_struct *vma_merge(struct - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma)) { - /* cases 1, 6 */ -+ add_vma_rss(prev, get_vma_rss(next)); -+ if (area != next) /* case 6 */ -+ add_vma_rss(prev, get_vma_rss(area)); - vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL); -- } else /* cases 2, 5, 7 */ -+ } else { /* cases 2, 5, 7 */ -+ if (next && addr == next->vm_start) { /* case 5 */ -+ unsigned long rss; -+ rss = pages_in_vma_range(next, addr, end); -+ sub_vma_rss(next, rss); -+ add_vma_rss(prev, rss); -+ } else if (area != next) /* case 7 */ -+ add_vma_rss(prev, get_vma_rss(area)); - vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL); -+ } - return prev; - } - -@@ -761,12 +791,19 @@ struct vm_area_struct *vma_merge(struct - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { -- if (prev && addr < prev->vm_end) /* case 4 */ -+ if (prev && addr < prev->vm_end) { /* case 4 */ -+ unsigned long rss; -+ rss = pages_in_vma_range(prev, addr, end); -+ sub_vma_rss(prev, rss); -+ add_vma_rss(next, rss); - vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL); -- else /* cases 3, 8 */ -+ } else { /* cases 3, 8 */ -+ if (area != next) /* case 8 */ -+ add_vma_rss(area, get_vma_rss(next)); - vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL); -+ } - return area; - } - -@@ -1033,6 +1070,10 @@ munmap_back: - } - } - -+ if (ub_memory_charge(mm, len, vm_flags, file, -+ (flags & MAP_EXECPRIO ? 
UB_SOFT : UB_HARD))) -+ goto charge_error; -+ - /* - * Can we just expand an old private anonymous mapping? - * The VM_SHARED test is necessary because shmem_zero_setup -@@ -1048,7 +1089,8 @@ munmap_back: - * specific mapper. the address has already been validated, but - * not unmapped, but the maps are removed from the list. - */ -- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); -+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | -+ (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0)); - if (!vma) { - error = -ENOMEM; - goto unacct_error; -@@ -1142,6 +1184,8 @@ unmap_and_free_vma: - free_vma: - kmem_cache_free(vm_area_cachep, vma); - unacct_error: -+ ub_memory_uncharge(mm, len, vm_flags, file); -+charge_error: - if (charged) - vm_unacct_memory(charged); - return error; -@@ -1471,12 +1515,16 @@ static int acct_stack_growth(struct vm_a - return -ENOMEM; - } - -+ if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags, -+ vma->vm_file, UB_SOFT)) -+ goto fail_charge; -+ - /* - * Overcommit.. This must be the final test, as it will - * update security statistics. 
- */ - if (security_vm_enough_memory(grow)) -- return -ENOMEM; -+ goto fail_sec; - - /* Ok, everything looks good - let it rip */ - mm->total_vm += grow; -@@ -1484,6 +1532,11 @@ static int acct_stack_growth(struct vm_a - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); - return 0; -+ -+fail_sec: -+ ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file); -+fail_charge: -+ return -ENOMEM; - } - - #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) -@@ -1744,8 +1797,13 @@ int split_vma(struct mm_struct * mm, str - else - vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - -+ /* protected with mmap sem */ -+ set_vma_rss(vma, pages_in_vma(vma)); -+ set_vma_rss(new, pages_in_vma(new)); -+ - return 0; - } -+EXPORT_SYMBOL_GPL(split_vma); - - /* Munmap is split into 2 main parts -- this part which finds - * what needs doing, and the areas themselves, which do the -@@ -1839,7 +1897,7 @@ static inline void verify_mm_writelocked - * anonymous maps. eventually we may be able to do some - * brk-specific accounting here. - */ --unsigned long do_brk(unsigned long addr, unsigned long len) -+static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft) - { - struct mm_struct * mm = current->mm; - struct vm_area_struct * vma, * prev; -@@ -1891,11 +1949,14 @@ unsigned long do_brk(unsigned long addr, - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - -- if (security_vm_enough_memory(len >> PAGE_SHIFT)) -- return -ENOMEM; -- - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - -+ if (ub_memory_charge(mm, len, flags, NULL, soft)) -+ goto fail_charge; -+ -+ if (security_vm_enough_memory(len >> PAGE_SHIFT)) -+ goto fail_sec; -+ - /* Can we just expand an old private anonymous mapping? 
*/ - if (vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL)) -@@ -1904,11 +1965,11 @@ unsigned long do_brk(unsigned long addr, - /* - * create a vma struct for an anonymous mapping - */ -- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); -- if (!vma) { -- vm_unacct_memory(len >> PAGE_SHIFT); -- return -ENOMEM; -- } -+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | -+ (soft == UB_SOFT ? __GFP_SOFT_UBC : 0)); -+ if (!vma) -+ goto fail_alloc; -+ - memset(vma, 0, sizeof(*vma)); - - vma->vm_mm = mm; -@@ -1925,8 +1986,19 @@ out: - make_pages_present(addr, addr + len); - } - return addr; -+ -+fail_alloc: -+ vm_unacct_memory(len >> PAGE_SHIFT); -+fail_sec: -+ ub_memory_uncharge(mm, len, flags, NULL); -+fail_charge: -+ return -ENOMEM; - } - -+unsigned long do_brk(unsigned long addr, unsigned long len) -+{ -+ return __do_brk(addr, len, UB_SOFT); -+} - EXPORT_SYMBOL(do_brk); - - /* Release all mmaps. */ -@@ -2036,6 +2108,7 @@ struct vm_area_struct *copy_vma(struct v - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; -+ set_vma_rss(new_vma, 0); - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) -diff -upr linux-2.6.16.orig/mm/mprotect.c linux-2.6.16-026test009/mm/mprotect.c ---- linux-2.6.16.orig/mm/mprotect.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/mprotect.c 2006-04-19 15:02:12.000000000 +0400 -@@ -9,6 +9,7 @@ - */ - - #include <linux/mm.h> -+#include <linux/module.h> - #include <linux/hugetlb.h> - #include <linux/slab.h> - #include <linux/shm.h> -@@ -25,6 +26,8 @@ - #include <asm/cacheflush.h> - #include <asm/tlbflush.h> - -+#include <ub/ub_vmpages.h> -+ - static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot) - { -@@ -109,12 +112,20 @@ mprotect_fixup(struct vm_area_struct *vm - pgprot_t newprot; - pgoff_t pgoff; - int error; -+ unsigned long ch_size; -+ int 
ch_dir; - - if (newflags == oldflags) { - *pprev = vma; - return 0; - } - -+ error = -ENOMEM; -+ ch_size = nrpages - pages_in_vma_range(vma, start, end); -+ ch_dir = ub_protected_charge(mm, ch_size, newflags, vma); -+ if (ch_dir == PRIVVM_ERROR) -+ goto fail_ch; -+ - /* - * If we make a private mapping writable we increase our commit; - * but (without finer accounting) cannot reduce our commit if we -@@ -127,7 +138,7 @@ mprotect_fixup(struct vm_area_struct *vm - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { - charged = nrpages; - if (security_vm_enough_memory(charged)) -- return -ENOMEM; -+ goto fail_sec; - newflags |= VM_ACCOUNT; - } - } -@@ -169,10 +180,16 @@ success: - change_protection(vma, start, end, newprot); - vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - vm_stat_account(mm, newflags, vma->vm_file, nrpages); -+ if (ch_dir == PRIVVM_TO_SHARED) -+ __ub_unused_privvm_dec(mm, ch_size); - return 0; - - fail: - vm_unacct_memory(charged); -+fail_sec: -+ if (ch_dir == PRIVVM_TO_PRIVATE) -+ __ub_unused_privvm_dec(mm, ch_size); -+fail_ch: - return error; - } - -@@ -280,3 +297,4 @@ out: - up_write(&current->mm->mmap_sem); - return error; - } -+EXPORT_SYMBOL_GPL(sys_mprotect); -diff -upr linux-2.6.16.orig/mm/mremap.c linux-2.6.16-026test009/mm/mremap.c ---- linux-2.6.16.orig/mm/mremap.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/mremap.c 2006-04-19 15:02:12.000000000 +0400 -@@ -23,6 +23,8 @@ - #include <asm/cacheflush.h> - #include <asm/tlbflush.h> - -+#include <ub/ub_vmpages.h> -+ - static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) - { - pgd_t *pgd; -@@ -106,6 +108,8 @@ static void move_ptes(struct vm_area_str - pte = ptep_clear_flush(vma, old_addr, old_pte); - /* ZERO_PAGE can be dependant on virtual addr */ - pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); -+ dec_vma_rss(vma); -+ inc_vma_rss(new_vma); - set_pte_at(mm, new_addr, new_pte, pte); - } - -@@ -166,17 +170,21 @@ static
unsigned long move_vma(struct vm_ - unsigned long hiwater_vm; - int split = 0; - -+ if (ub_memory_charge(mm, new_len, vm_flags, -+ vma->vm_file, UB_HARD)) -+ goto err; -+ - /* - * We'd prefer to avoid failure later on in do_munmap: - * which may split one vma into three before unmapping. - */ - if (mm->map_count >= sysctl_max_map_count - 3) -- return -ENOMEM; -+ goto err_nomem; - - new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); - new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); - if (!new_vma) -- return -ENOMEM; -+ goto err_nomem; - - moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); - if (moved_len < old_len) { -@@ -235,7 +243,13 @@ static unsigned long move_vma(struct vm_ - new_addr + new_len); - } - -- return new_addr; -+ if (new_addr != -ENOMEM) -+ return new_addr; -+ -+err_nomem: -+ ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file); -+err: -+ return -ENOMEM; - } - - /* -@@ -361,6 +375,11 @@ unsigned long do_mremap(unsigned long ad - if (max_addr - addr >= new_len) { - int pages = (new_len - old_len) >> PAGE_SHIFT; - -+ ret = -ENOMEM; -+ if (ub_memory_charge(mm, new_len, vma->vm_flags, -+ vma->vm_file, UB_HARD)) -+ goto out; -+ - vma_adjust(vma, vma->vm_start, - addr + new_len, vma->vm_pgoff, NULL); - -diff -upr linux-2.6.16.orig/mm/oom_kill.c linux-2.6.16-026test009/mm/oom_kill.c ---- linux-2.6.16.orig/mm/oom_kill.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/oom_kill.c 2006-04-19 15:02:12.000000000 +0400 -@@ -176,7 +176,7 @@ static struct task_struct *select_bad_pr - *ppoints = 0; - - do_posix_clock_monotonic_gettime(&uptime); -- do_each_thread(g, p) { -+ do_each_thread_all(g, p) { - unsigned long points; - int releasing; - -@@ -205,7 +205,7 @@ static struct task_struct *select_bad_pr - chosen = p; - *ppoints = points; - } -- } while_each_thread(g, p); -+ } while_each_thread_all(g, p); - return chosen; - } - -@@ -261,10 +261,10 @@ static struct mm_struct *oom_kill_task(t - * 
kill all processes that share the ->mm (i.e. all threads), - * but are in a different thread group - */ -- do_each_thread(g, q) -+ do_each_thread_all(g, q) { - if (q->mm == mm && q->tgid != p->tgid) - __oom_kill_task(q, message); -- while_each_thread(g, q); -+ } while_each_thread_all(g, q); - - return mm; - } -diff -upr linux-2.6.16.orig/mm/page_alloc.c linux-2.6.16-026test009/mm/page_alloc.c ---- linux-2.6.16.orig/mm/page_alloc.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/page_alloc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -41,6 +41,8 @@ - #include <asm/tlbflush.h> - #include "internal.h" - -+#include <ub/ub_mem.h> -+ - /* - * MCD - HACK: Find somewhere to initialize this EARLY, or make this - * initializer cleaner -@@ -50,6 +52,7 @@ EXPORT_SYMBOL(node_online_map); - nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; - EXPORT_SYMBOL(node_possible_map); - struct pglist_data *pgdat_list __read_mostly; -+EXPORT_SYMBOL(pgdat_list); - unsigned long totalram_pages __read_mostly; - unsigned long totalhigh_pages __read_mostly; - long nr_swap_pages; -@@ -153,7 +156,8 @@ static void bad_page(struct page *page) - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_swapcache | -- 1 << PG_writeback ); -+ 1 << PG_writeback | -+ 1 << PG_buddy ); - set_page_count(page, 0); - reset_page_mapcount(page); - page->mapping = NULL; -@@ -224,12 +228,12 @@ static inline unsigned long page_order(s - - static inline void set_page_order(struct page *page, int order) { - set_page_private(page, order); -- __SetPagePrivate(page); -+ __SetPageBuddy(page); - } - - static inline void rmv_page_order(struct page *page) - { -- __ClearPagePrivate(page); -+ __ClearPageBuddy(page); - set_page_private(page, 0); - } - -@@ -268,11 +272,13 @@ __find_combined_index(unsigned long page - * This function checks whether a page is free && is the buddy - * we can do coalesce a page and its buddy if - * (a) the buddy is not in a hole && -- * (b) the buddy is free && -- * (c) the buddy is 
on the buddy system && -- * (d) a page and its buddy have the same order. -- * for recording page's order, we use page_private(page) and PG_private. -+ * (b) the buddy is in the buddy system && -+ * (c) a page and its buddy have the same order. - * -+ * For recording whether a page is in the buddy system, we use PG_buddy. -+ * Setting, clearing, and testing PG_buddy is serialized by zone->lock. -+ * -+ * For recording page's order, we use page_private(page). - */ - static inline int page_is_buddy(struct page *page, int order) - { -@@ -281,10 +287,10 @@ static inline int page_is_buddy(struct p - return 0; - #endif - -- if (PagePrivate(page) && -- (page_order(page) == order) && -- page_count(page) == 0) -+ if (PageBuddy(page) && page_order(page) == order) { -+ BUG_ON(page_count(page) != 0); - return 1; -+ } - return 0; - } - -@@ -301,7 +307,7 @@ static inline int page_is_buddy(struct p - * as necessary, plus some accounting needed to play nicely with other - * parts of the VM system. - * At each level, we keep a list of pages, which are heads of continuous -- * free pages of length of (1 << order) and marked with PG_Private.Page's -+ * free pages of length of (1 << order) and marked with PG_buddy. Page's - * order is recorded in page_private(page) field. - * So when we are allocating or freeing one, we can derive the state of the - * other. 
That is, if we allocate a small block, and both were -@@ -364,7 +370,8 @@ static inline int free_pages_check(struc - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | -- 1 << PG_reserved )))) -+ 1 << PG_reserved | -+ 1 << PG_buddy )))) - bad_page(page); - if (PageDirty(page)) - __ClearPageDirty(page); -@@ -434,6 +441,7 @@ static void __free_pages_ok(struct page - return; - - kernel_map_pages(page, 1 << order, 0); -+ ub_page_uncharge(page, order); - local_irq_save(flags); - __mod_page_state(pgfree, 1 << order); - free_one_page(page_zone(page), page, order); -@@ -522,7 +530,8 @@ static int prep_new_page(struct page *pa - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | -- 1 << PG_reserved )))) -+ 1 << PG_reserved | -+ 1 << PG_buddy )))) - bad_page(page); - - /* -@@ -721,6 +730,7 @@ static void fastcall free_hot_cold_page( - kernel_map_pages(page, 1, 0); - - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; -+ ub_page_uncharge(page, 0); - local_irq_save(flags); - __inc_page_state(pgfree); - list_add(&page->lru, &pcp->list); -@@ -894,6 +904,26 @@ get_page_from_freelist(gfp_t gfp_mask, u - return page; - } - -+static void __alloc_collect_stats(unsigned int gfp_mask, -+ unsigned int order, struct page *page, cycles_t time) -+{ -+ int ind; -+ unsigned long flags; -+ -+ time = get_cycles() - time; -+ if (!(gfp_mask & __GFP_WAIT)) -+ ind = 0; -+ else if (!(gfp_mask & __GFP_HIGHMEM)) -+ ind = (order > 0 ? 2 : 1); -+ else -+ ind = (order > 0 ? 4 : 3); -+ spin_lock_irqsave(&kstat_glb_lock, flags); -+ KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time); -+ if (!page) -+ kstat_glob.alloc_fails[ind]++; -+ spin_unlock_irqrestore(&kstat_glb_lock, flags); -+} -+ - /* - * This is the 'heart' of the zoned buddy allocator. 
- */ -@@ -909,6 +939,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned i - int do_retry; - int alloc_flags; - int did_some_progress; -+ cycles_t start; - - might_sleep_if(wait); - -@@ -920,6 +951,7 @@ restart: - return NULL; - } - -+ start = get_cycles(); - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); - if (page) -@@ -1038,6 +1070,7 @@ rebalance: - } - - nopage: -+ __alloc_collect_stats(gfp_mask, order, page, start); - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { - printk(KERN_WARNING "%s: page allocation failure." - " order:%d, mode:0x%x\n", -@@ -1045,7 +1078,13 @@ nopage: - dump_stack(); - show_mem(); - } -+ return NULL; -+ - got_pg: -+ if (ub_page_charge(page, order, gfp_mask)) { -+ __free_pages(page, order); -+ page = NULL; -+ } - return page; - } - -@@ -2378,7 +2417,10 @@ static void *vmstat_start(struct seq_fil - m->private = ps; - if (!ps) - return ERR_PTR(-ENOMEM); -- get_full_page_state(ps); -+ if (ve_is_super(get_exec_env())) -+ get_full_page_state(ps); -+ else -+ memset(ps, 0, sizeof(*ps)); - ps->pgpgin /= 2; /* sectors -> kbytes */ - ps->pgpgout /= 2; - return (unsigned long *)ps + *pos; -diff -upr linux-2.6.16.orig/mm/rmap.c linux-2.6.16-026test009/mm/rmap.c ---- linux-2.6.16.orig/mm/rmap.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/rmap.c 2006-04-19 15:02:12.000000000 +0400 -@@ -56,6 +56,8 @@ - - #include <asm/tlbflush.h> - -+#include <ub/ub_vmpages.h> -+ - //#define RMAP_DEBUG /* can be enabled only for debugging */ - - kmem_cache_t *anon_vma_cachep; -@@ -117,6 +119,7 @@ int anon_vma_prepare(struct vm_area_stru - } - return 0; - } -+EXPORT_SYMBOL_GPL(anon_vma_prepare); - - void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) - { -@@ -145,6 +148,7 @@ void anon_vma_link(struct vm_area_struct - spin_unlock(&anon_vma->lock); - } - } -+EXPORT_SYMBOL_GPL(anon_vma_link); - - void anon_vma_unlink(struct vm_area_struct *vma) - { -@@ -180,14 
+184,15 @@ static void anon_vma_ctor(void *data, km - void __init anon_vma_init(void) - { - anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), -- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); -+ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, -+ anon_vma_ctor, NULL); - } - - /* - * Getting a lock on a stable anon_vma from a page off the LRU is - * tricky: page_lock_anon_vma rely on RCU to guard against the races. - */ --static struct anon_vma *page_lock_anon_vma(struct page *page) -+struct anon_vma *page_lock_anon_vma(struct page *page) - { - struct anon_vma *anon_vma = NULL; - unsigned long anon_mapping; -@@ -205,6 +210,7 @@ out: - rcu_read_unlock(); - return anon_vma; - } -+EXPORT_SYMBOL_GPL(page_lock_anon_vma); - - #ifdef CONFIG_MIGRATION - /* -@@ -220,6 +226,7 @@ void remove_from_swap(struct page *page) - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned long mapping; -+ struct page_beancounter *pb; - - if (!PageSwapCache(page)) - return; -@@ -229,6 +236,10 @@ void remove_from_swap(struct page *page) - if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) - return; - -+ pb = NULL; -+ if (pb_alloc_all(&pb)) -+ return; -+ - /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. 
- */ -@@ -236,10 +247,12 @@ void remove_from_swap(struct page *page) - spin_lock(&anon_vma->lock); - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) -- remove_vma_swap(vma, page); -+ remove_vma_swap(vma, page, &pb); - - spin_unlock(&anon_vma->lock); - delete_from_swap_cache(page); -+ -+ pb_free_list(&pb); - } - EXPORT_SYMBOL(remove_from_swap); - #endif -@@ -638,7 +651,11 @@ static int try_to_unmap_one(struct page - } else - dec_mm_counter(mm, file_rss); - -+ dec_vma_rss(vma); - page_remove_rmap(page); -+ ub_unused_privvm_inc(mm, vma); -+ ub_unmap_inc(mm); -+ pb_remove_ref(page, mm); - page_cache_release(page); - - out_unmap: -@@ -729,8 +746,12 @@ static void try_to_unmap_cluster(unsigne - set_page_dirty(page); - - page_remove_rmap(page); -+ ub_unmap_inc(mm); -+ pb_remove_ref(page, mm); -+ ub_unused_privvm_inc(mm, vma); - page_cache_release(page); - dec_mm_counter(mm, file_rss); -+ dec_vma_rss(vma); - (*mapcount)--; - } - pte_unmap_unlock(pte - 1, ptl); -diff -upr linux-2.6.16.orig/mm/shmem.c linux-2.6.16-026test009/mm/shmem.c ---- linux-2.6.16.orig/mm/shmem.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/shmem.c 2006-04-19 15:02:12.000000000 +0400 -@@ -50,6 +50,8 @@ - #include <asm/div64.h> - #include <asm/pgtable.h> - -+#include <ub/ub_vmpages.h> -+ - /* This magic number is used in glibc for posix shared memory */ - #define TMPFS_MAGIC 0x01021994 - -@@ -211,7 +213,7 @@ static void shmem_free_blocks(struct ino - * - * It has to be called with the spinlock held. 
- */ --static void shmem_recalc_inode(struct inode *inode) -+static void shmem_recalc_inode(struct inode *inode, long swp_freed) - { - struct shmem_inode_info *info = SHMEM_I(inode); - long freed; -@@ -221,6 +223,8 @@ static void shmem_recalc_inode(struct in - info->alloced -= freed; - shmem_unacct_blocks(info->flags, freed); - shmem_free_blocks(inode, freed); -+ if (freed > swp_freed) -+ ub_tmpfs_respages_sub(info, freed - swp_freed); - } - } - -@@ -326,6 +330,11 @@ static void shmem_swp_set(struct shmem_i - struct page *page = kmap_atomic_to_page(entry); - set_page_private(page, page_private(page) + incdec); - } -+ -+ if (incdec == 1) -+ ub_tmpfs_respages_dec(info); -+ else -+ ub_tmpfs_respages_inc(info); - } - - /* -@@ -342,14 +351,24 @@ static swp_entry_t *shmem_swp_alloc(stru - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - struct page *page = NULL; - swp_entry_t *entry; -+ unsigned long ub_val; - - if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return ERR_PTR(-EINVAL); - -+ ub_val = 0; -+ if (info->next_index <= index) { -+ ub_val = index + 1 - info->next_index; -+ if (ub_shmpages_charge(info, ub_val)) -+ return ERR_PTR(-ENOSPC); -+ } -+ - while (!(entry = shmem_swp_entry(info, index, &page))) { -- if (sgp == SGP_READ) -- return shmem_swp_map(ZERO_PAGE(0)); -+ if (sgp == SGP_READ) { -+ entry = shmem_swp_map(ZERO_PAGE(0)); -+ goto out; -+ } - /* - * Test free_blocks against 1 not 0, since we have 1 data - * page (and perhaps indirect index pages) yet to allocate: -@@ -359,7 +378,8 @@ static swp_entry_t *shmem_swp_alloc(stru - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks <= 1) { - spin_unlock(&sbinfo->stat_lock); -- return ERR_PTR(-ENOSPC); -+ entry = ERR_PTR(-ENOSPC); -+ goto out; - } - sbinfo->free_blocks--; - inode->i_blocks += BLOCKS_PER_PAGE; -@@ -367,31 +387,43 @@ static swp_entry_t *shmem_swp_alloc(stru - } - - spin_unlock(&info->lock); -- page = 
shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); -+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | -+ __GFP_ZERO | __GFP_UBC); - if (page) - set_page_private(page, 0); - spin_lock(&info->lock); - - if (!page) { -- shmem_free_blocks(inode, 1); -- return ERR_PTR(-ENOMEM); -+ entry = ERR_PTR(-ENOMEM); -+ goto out_block; - } - if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { - entry = ERR_PTR(-EINVAL); -- break; -+ goto out_dir; - } -- if (info->next_index <= index) -+ if (info->next_index <= index) { -+ ub_val = 0; - info->next_index = index + 1; -+ } - } - if (page) { - /* another task gave its page, or truncated the file */ - shmem_free_blocks(inode, 1); - shmem_dir_free(page); - } -- if (info->next_index <= index && !IS_ERR(entry)) -+ if (info->next_index <= index) - info->next_index = index + 1; - return entry; -+ -+out_dir: -+ shmem_dir_free(page); -+out_block: -+ shmem_free_blocks(inode, 1); -+out: -+ if (ub_val) -+ ub_shmpages_uncharge(info, ub_val); -+ return entry; - } - - /* -@@ -484,6 +516,7 @@ static void shmem_truncate_range(struct - return; - - spin_lock(&info->lock); -+ ub_shmpages_uncharge(info, info->next_index - idx); - info->flags |= SHMEM_TRUNCATE; - if (likely(end == (loff_t) -1)) { - limit = info->next_index; -@@ -613,7 +646,7 @@ done2: - info->swapped -= nr_swaps_freed; - if (nr_pages_to_free) - shmem_free_blocks(inode, nr_pages_to_free); -- shmem_recalc_inode(inode); -+ shmem_recalc_inode(inode, nr_swaps_freed); - spin_unlock(&info->lock); - - /* -@@ -696,6 +729,7 @@ static void shmem_delete_inode(struct in - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } -+ shmi_ub_put(info); - clear_inode(inode); - } - -@@ -817,6 +851,12 @@ int shmem_unuse(swp_entry_t entry, struc - return found; - } - -+#ifdef CONFIG_USER_RESOURCE -+#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub)) -+#else -+#define shm_get_swap_page(info) (get_swap_page(NULL)) -+#endif -+ - /* 
- * Move the page from the page cache to the swap cache. - */ -@@ -837,12 +877,12 @@ static int shmem_writepage(struct page * - info = SHMEM_I(inode); - if (info->flags & VM_LOCKED) - goto redirty; -- swap = get_swap_page(); -+ swap = shm_get_swap_page(info); - if (!swap.val) - goto redirty; - - spin_lock(&info->lock); -- shmem_recalc_inode(inode); -+ shmem_recalc_inode(inode, 0); - if (index >= info->next_index) { - BUG_ON(!(info->flags & SHMEM_TRUNCATE)); - goto unlock; -@@ -1030,7 +1070,7 @@ repeat: - goto failed; - - spin_lock(&info->lock); -- shmem_recalc_inode(inode); -+ shmem_recalc_inode(inode, 0); - entry = shmem_swp_alloc(info, idx, sgp); - if (IS_ERR(entry)) { - spin_unlock(&info->lock); -@@ -1206,6 +1246,7 @@ repeat: - spin_unlock(&info->lock); - flush_dcache_page(filepage); - SetPageUptodate(filepage); -+ ub_tmpfs_respages_inc(info); - } - done: - if (*pagep != filepage) { -@@ -1307,28 +1348,6 @@ shmem_get_policy(struct vm_area_struct * - } - #endif - --int shmem_lock(struct file *file, int lock, struct user_struct *user) --{ -- struct inode *inode = file->f_dentry->d_inode; -- struct shmem_inode_info *info = SHMEM_I(inode); -- int retval = -ENOMEM; -- -- spin_lock(&info->lock); -- if (lock && !(info->flags & VM_LOCKED)) { -- if (!user_shm_lock(inode->i_size, user)) -- goto out_nomem; -- info->flags |= VM_LOCKED; -- } -- if (!lock && (info->flags & VM_LOCKED) && user) { -- user_shm_unlock(inode->i_size, user); -- info->flags &= ~VM_LOCKED; -- } -- retval = 0; --out_nomem: -- spin_unlock(&info->lock); -- return retval; --} -- - int shmem_mmap(struct file *file, struct vm_area_struct *vma) - { - file_accessed(file); -@@ -1365,6 +1384,7 @@ shmem_get_inode(struct super_block *sb, - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - info = SHMEM_I(inode); - memset(info, 0, (char *)inode - (char *)info); -+ shmi_ub_set(info, get_exec_ub()); - spin_lock_init(&info->lock); - INIT_LIST_HEAD(&info->swaplist); - -@@ -2226,6 +2246,10 @@ static 
struct vm_operations_struct shmem - #endif - }; - -+int is_shmem_mapping(struct address_space *map) -+{ -+ return (map != NULL && map->a_ops == &shmem_aops); -+} - - static struct super_block *shmem_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -@@ -2233,13 +2257,19 @@ static struct super_block *shmem_get_sb( - return get_sb_nodev(fs_type, flags, data, shmem_fill_super); - } - --static struct file_system_type tmpfs_fs_type = { -+struct file_system_type tmpfs_fs_type = { - .owner = THIS_MODULE, - .name = "tmpfs", - .get_sb = shmem_get_sb, - .kill_sb = kill_litter_super, - }; -+EXPORT_SYMBOL(tmpfs_fs_type); -+ -+#ifdef CONFIG_VE -+#define shm_mnt (get_exec_env()->shmem_mnt) -+#else - static struct vfsmount *shm_mnt; -+#endif - - static int __init init_tmpfs(void) - { -@@ -2276,6 +2306,36 @@ out3: - } - module_init(init_tmpfs) - -+static inline int shm_charge_ahead(struct inode *inode) -+{ -+#ifdef CONFIG_USER_RESOURCE -+ struct shmem_inode_info *info = SHMEM_I(inode); -+ unsigned long idx; -+ swp_entry_t *entry; -+ -+ if (!inode->i_size) -+ return 0; -+ idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; -+ /* -+ * Just touch info to allocate space for entry and -+ * make all UBC checks -+ */ -+ spin_lock(&info->lock); -+ entry = shmem_swp_alloc(info, idx, SGP_CACHE); -+ if (IS_ERR(entry)) -+ goto err; -+ shmem_swp_unmap(entry); -+ spin_unlock(&info->lock); -+ return 0; -+ -+err: -+ spin_unlock(&info->lock); -+ return PTR_ERR(entry); -+#else -+ return 0; -+#endif -+} -+ - /* - * shmem_file_setup - get an unlinked file living in tmpfs - * -@@ -2323,6 +2383,10 @@ struct file *shmem_file_setup(char *name - d_instantiate(dentry, inode); - inode->i_size = size; - inode->i_nlink = 0; /* It is unlinked */ -+ error = shm_charge_ahead(inode); -+ if (error) -+ goto close_file; -+ - file->f_vfsmnt = mntget(shm_mnt); - file->f_dentry = dentry; - file->f_mapping = inode->i_mapping; -@@ -2338,6 +2402,7 @@ put_memory: - shmem_unacct_size(flags, 
size); - return ERR_PTR(error); - } -+EXPORT_SYMBOL_GPL(shmem_file_setup); - - /* - * shmem_zero_setup - setup a shared anonymous mapping -@@ -2355,6 +2420,8 @@ int shmem_zero_setup(struct vm_area_stru - - if (vma->vm_file) - fput(vma->vm_file); -+ else if (vma->vm_flags & VM_WRITE) -+ __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT); - vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; - return 0; -diff -upr linux-2.6.16.orig/mm/slab.c linux-2.6.16-026test009/mm/slab.c ---- linux-2.6.16.orig/mm/slab.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/slab.c 2006-04-19 15:02:12.000000000 +0400 -@@ -105,32 +105,19 @@ - #include <linux/nodemask.h> - #include <linux/mempolicy.h> - #include <linux/mutex.h> -+#include <linux/kmem_slab.h> -+#include <linux/kmem_cache.h> - - #include <asm/uaccess.h> - #include <asm/cacheflush.h> - #include <asm/tlbflush.h> - #include <asm/page.h> - --/* -- * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, -- * SLAB_RED_ZONE & SLAB_POISON. -- * 0 for faster, smaller code (especially in the critical paths). -- * -- * STATS - 1 to collect stats for /proc/slabinfo. -- * 0 for faster, smaller code (especially in the critical paths). -- * -- * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) -- */ -+#include <ub/ub_mem.h> - --#ifdef CONFIG_DEBUG_SLAB --#define DEBUG 1 --#define STATS 1 --#define FORCED_DEBUG 1 --#else --#define DEBUG 0 --#define STATS 0 --#define FORCED_DEBUG 0 --#endif -+#define DEBUG SLAB_DEBUG -+#define STATS SLAB_STATS -+#define FORCED_DEBUG SLAB_FORCED_DEBUG - - /* Shouldn't this be in a header file somewhere? 
*/ - #define BYTES_PER_WORD sizeof(void *) -@@ -173,134 +160,20 @@ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ -- SLAB_DESTROY_BY_RCU) -+ SLAB_DESTROY_BY_RCU | SLAB_UBC | SLAB_NO_CHARGE) - #else - # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ -- SLAB_DESTROY_BY_RCU) -+ SLAB_DESTROY_BY_RCU | SLAB_UBC | SLAB_NO_CHARGE) - #endif - --/* -- * kmem_bufctl_t: -- * -- * Bufctl's are used for linking objs within a slab -- * linked offsets. -- * -- * This implementation relies on "struct page" for locating the cache & -- * slab an object belongs to. -- * This allows the bufctl structure to be small (one int), but limits -- * the number of objects a slab (not a cache) can contain when off-slab -- * bufctls are used. The limit is the size of the largest general cache -- * that does not use off-slab slabs. -- * For 32bit archs with 4 kB pages, is this 56. -- * This is not serious, as it is only for large objects, when it is unwise -- * to have too many per slab. -- * Note: This limit can be raised by introducing a general cache whose size -- * is less than 512 (PAGE_SIZE<<3), but greater than 256. -- */ -- --typedef unsigned int kmem_bufctl_t; --#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) --#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) --#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) -- - /* Max number of objs-per-slab for caches which use off-slab slabs. - * Needed to avoid a possible looping condition in cache_grow(). - */ - static unsigned long offslab_limit; - - /* -- * struct slab -- * -- * Manages the objs in a slab. Placed either at the beginning of mem allocated -- * for a slab, or allocated from an general cache. -- * Slabs are chained into three list: fully used, partial, fully free slabs. 
-- */ --struct slab { -- struct list_head list; -- unsigned long colouroff; -- void *s_mem; /* including colour offset */ -- unsigned int inuse; /* num of objs active in slab */ -- kmem_bufctl_t free; -- unsigned short nodeid; --}; -- --/* -- * struct slab_rcu -- * -- * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to -- * arrange for kmem_freepages to be called via RCU. This is useful if -- * we need to approach a kernel structure obliquely, from its address -- * obtained without the usual locking. We can lock the structure to -- * stabilize it and check it's still at the given address, only if we -- * can be sure that the memory has not been meanwhile reused for some -- * other kind of object (which our subsystem's lock might corrupt). -- * -- * rcu_read_lock before reading the address, then rcu_read_unlock after -- * taking the spinlock within the structure expected at that address. -- * -- * We assume struct slab_rcu can overlay struct slab when destroying. -- */ --struct slab_rcu { -- struct rcu_head head; -- struct kmem_cache *cachep; -- void *addr; --}; -- --/* -- * struct array_cache -- * -- * Purpose: -- * - LIFO ordering, to hand out cache-warm objects from _alloc -- * - reduce the number of linked list operations -- * - reduce spinlock operations -- * -- * The limit is stored in the per-cpu structure to reduce the data cache -- * footprint. -- * -- */ --struct array_cache { -- unsigned int avail; -- unsigned int limit; -- unsigned int batchcount; -- unsigned int touched; -- spinlock_t lock; -- void *entry[0]; /* -- * Must have this definition in here for the proper -- * alignment of array_cache. Also simplifies accessing -- * the entries. -- * [0] is for gcc 2.95. It should really be []. -- */ --}; -- --/* bootstrap: The caches do not work without cpuarrays anymore, -- * but the cpuarrays are allocated from the generic caches... 
-- */ --#define BOOT_CPUCACHE_ENTRIES 1 --struct arraycache_init { -- struct array_cache cache; -- void *entries[BOOT_CPUCACHE_ENTRIES]; --}; -- --/* -- * The slab lists for all objects. -- */ --struct kmem_list3 { -- struct list_head slabs_partial; /* partial list first, better asm code */ -- struct list_head slabs_full; -- struct list_head slabs_free; -- unsigned long free_objects; -- unsigned long next_reap; -- int free_touched; -- unsigned int free_limit; -- unsigned int colour_next; /* Per-node cache coloring */ -- spinlock_t list_lock; -- struct array_cache *shared; /* shared per node */ -- struct array_cache **alien; /* on other nodes */ --}; -- --/* - * Need this for bootstrapping a per node allocator. - */ - #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) -@@ -364,79 +237,6 @@ static void kmem_list3_init(struct kmem_ - MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ - } while (0) - --/* -- * struct kmem_cache -- * -- * manages a cache. -- */ -- --struct kmem_cache { --/* 1) per-cpu data, touched during every alloc/free */ -- struct array_cache *array[NR_CPUS]; -- unsigned int batchcount; -- unsigned int limit; -- unsigned int shared; -- unsigned int buffer_size; --/* 2) touched by every alloc & free from the backend */ -- struct kmem_list3 *nodelists[MAX_NUMNODES]; -- unsigned int flags; /* constant flags */ -- unsigned int num; /* # of objs per slab */ -- spinlock_t spinlock; -- --/* 3) cache_grow/shrink */ -- /* order of pgs per slab (2^n) */ -- unsigned int gfporder; -- -- /* force GFP flags, e.g. 
GFP_DMA */ -- gfp_t gfpflags; -- -- size_t colour; /* cache colouring range */ -- unsigned int colour_off; /* colour offset */ -- struct kmem_cache *slabp_cache; -- unsigned int slab_size; -- unsigned int dflags; /* dynamic flags */ -- -- /* constructor func */ -- void (*ctor) (void *, struct kmem_cache *, unsigned long); -- -- /* de-constructor func */ -- void (*dtor) (void *, struct kmem_cache *, unsigned long); -- --/* 4) cache creation/removal */ -- const char *name; -- struct list_head next; -- --/* 5) statistics */ --#if STATS -- unsigned long num_active; -- unsigned long num_allocations; -- unsigned long high_mark; -- unsigned long grown; -- unsigned long reaped; -- unsigned long errors; -- unsigned long max_freeable; -- unsigned long node_allocs; -- unsigned long node_frees; -- atomic_t allochit; -- atomic_t allocmiss; -- atomic_t freehit; -- atomic_t freemiss; --#endif --#if DEBUG -- /* -- * If debugging is enabled, then the allocator can add additional -- * fields and/or padding to every object. buffer_size contains the total -- * object size including these internal fields, the following two -- * variables contain the offset to the user object and its size. -- */ -- int obj_offset; -- int obj_size; --#endif --}; -- --#define CFLGS_OFF_SLAB (0x80000000UL) --#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -- - #define BATCHREFILL_LIMIT 16 - /* Optimization question: fewer reaps means less - * probability for unnessary cpucache drain/refill cycles. -@@ -573,42 +373,6 @@ static void **dbg_userword(struct kmem_c - #define BREAK_GFP_ORDER_LO 0 - static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; - --/* Functions for storing/retrieving the cachep and or slab from the -- * global 'mem_map'. These are used to find the slab an obj belongs to. -- * With kfree(), these are used to find the cache which an obj belongs to. 
-- */ --static inline void page_set_cache(struct page *page, struct kmem_cache *cache) --{ -- page->lru.next = (struct list_head *)cache; --} -- --static inline struct kmem_cache *page_get_cache(struct page *page) --{ -- return (struct kmem_cache *)page->lru.next; --} -- --static inline void page_set_slab(struct page *page, struct slab *slab) --{ -- page->lru.prev = (struct list_head *)slab; --} -- --static inline struct slab *page_get_slab(struct page *page) --{ -- return (struct slab *)page->lru.prev; --} -- --static inline struct kmem_cache *virt_to_cache(const void *obj) --{ -- struct page *page = virt_to_page(obj); -- return page_get_cache(page); --} -- --static inline struct slab *virt_to_slab(const void *obj) --{ -- struct page *page = virt_to_page(obj); -- return page_get_slab(page); --} -- - /* These are the default caches for kmalloc. Custom caches can have other sizes. */ - struct cache_sizes malloc_sizes[] = { - #define CACHE(x) { .cs_size = (x) }, -@@ -715,9 +479,17 @@ struct kmem_cache *kmem_find_general_cac - } - EXPORT_SYMBOL(kmem_find_general_cachep); - --static size_t slab_mgmt_size(size_t nr_objs, size_t align) -+static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags) - { -- return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); -+ size_t size_noub; -+ -+ size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t); -+ return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags); -+} -+ -+static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags) -+{ -+ return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align); - } - - /* Calculate the number of objects and left-over bytes for a given -@@ -761,20 +533,23 @@ static void cache_estimate(unsigned long - * into account. 
- */ - nr_objs = (slab_size - sizeof(struct slab)) / -- (buffer_size + sizeof(kmem_bufctl_t)); -+ (buffer_size + sizeof(kmem_bufctl_t) + -+ UB_EXTRA(flags)); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ -- if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size -- > slab_size) -+ if (slab_mgmt_size(nr_objs, align, flags) + -+ nr_objs * buffer_size > slab_size) - nr_objs--; -+ BUG_ON(slab_mgmt_size(nr_objs, align, flags) + -+ nr_objs * buffer_size > slab_size); - - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; - -- mgmt_size = slab_mgmt_size(nr_objs, align); -+ mgmt_size = slab_mgmt_size(nr_objs, align, flags); - } - *num = nr_objs; - *left_over = slab_size - nr_objs*buffer_size - mgmt_size; -@@ -1254,6 +1029,7 @@ void __init kmem_cache_init(void) - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | -+ SLAB_UBC|SLAB_NO_CHARGE | - SLAB_PANIC), NULL, NULL); - - if (INDEX_AC != INDEX_L3) -@@ -1261,8 +1037,9 @@ void __init kmem_cache_init(void) - kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, -- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, -- NULL); -+ (ARCH_KMALLOC_FLAGS | -+ SLAB_UBC | SLAB_NO_CHARGE | -+ SLAB_PANIC), NULL, NULL); - - while (sizes->cs_size != ULONG_MAX) { - /* -@@ -1277,14 +1054,14 @@ void __init kmem_cache_init(void) - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS -+ | SLAB_UBC -+ | SLAB_NO_CHARGE - | SLAB_PANIC), - NULL, NULL); - - /* Inc off-slab bufctl limit until the ceiling is hit. 
*/ -- if (!(OFF_SLAB(sizes->cs_cachep))) { -- offslab_limit = sizes->cs_size - sizeof(struct slab); -- offslab_limit /= sizeof(kmem_bufctl_t); -- } -+ if (!(OFF_SLAB(sizes->cs_cachep))) -+ offslab_limit = sizes->cs_size; - - sizes->cs_dmacachep = kmem_cache_create(names->name_dma, - sizes->cs_size, -@@ -1704,8 +1481,13 @@ static inline size_t calculate_slab_orde - continue; - - /* More than offslab_limit objects will cause problems */ -- if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit) -- break; -+ if (flags & CFLGS_OFF_SLAB) { -+ unsigned long slab_size; -+ -+ slab_size = slab_mgmt_size_noalign(num, flags); -+ if (slab_size > offslab_limit) -+ break; -+ } - - /* Found something acceptable - save it away */ - cachep->num = num; -@@ -1950,8 +1732,7 @@ kmem_cache_create (const char *name, siz - cachep = NULL; - goto oops; - } -- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) -- + sizeof(struct slab), align); -+ slab_size = slab_mgmt_size(cachep->num, align, flags); - - /* - * If the slab has been placed off-slab, and we have enough space then -@@ -1964,8 +1745,7 @@ kmem_cache_create (const char *name, siz - - if (flags & CFLGS_OFF_SLAB) { - /* really off slab. No need for manual alignment */ -- slab_size = -- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); -+ slab_size = slab_mgmt_size_noalign(cachep->num, flags); - } - - cachep->colour_off = cache_line_size(); -@@ -2045,6 +1825,7 @@ kmem_cache_create (const char *name, siz - - /* cache setup completed, link it into the list */ - list_add(&cachep->next, &cache_chain); -+ set_cache_objuse(cachep); - oops: - if (!cachep && (flags & SLAB_PANIC)) - panic("kmem_cache_create(): failed to create slab `%s'\n", -@@ -2282,7 +2063,8 @@ static struct slab *alloc_slabmgmt(struc - - if (OFF_SLAB(cachep)) { - /* Slab management obj is off-slab. 
*/ -- slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); -+ slabp = kmem_cache_alloc(cachep->slabp_cache, -+ local_flags & (~__GFP_UBC)); - if (!slabp) - return NULL; - } else { -@@ -2292,15 +2074,11 @@ static struct slab *alloc_slabmgmt(struc - slabp->inuse = 0; - slabp->colouroff = colour_off; - slabp->s_mem = objp + colour_off; -+ init_slab_ubps(cachep, slabp); - - return slabp; - } - --static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) --{ -- return (kmem_bufctl_t *) (slabp + 1); --} -- - static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slabp, unsigned long ctor_flags) - { -@@ -2470,7 +2248,7 @@ static int cache_grow(struct kmem_cache - /* Get mem for the objs. - * Attempt to allocate a physical page from 'nodeid', - */ -- if (!(objp = kmem_getpages(cachep, flags, nodeid))) -+ if (!(objp = kmem_getpages(cachep, flags & (~__GFP_UBC), nodeid))) - goto failed; - - /* Get slab management. */ -@@ -2823,6 +2601,11 @@ __cache_alloc(struct kmem_cache *cachep, - objp = cache_alloc_debugcheck_after(cachep, flags, objp, - caller); - prefetchw(objp); -+ -+ if (objp && ub_slab_charge(objp, flags)) { -+ kmem_cache_free(cachep, objp); -+ objp = NULL; -+ } - return objp; - } - -@@ -2997,6 +2780,8 @@ static inline void __cache_free(struct k - check_irq_off(); - objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - -+ ub_slab_uncharge(objp); -+ - /* Make sure we are not freeing a object from another - * node to the array cache on this cpu. 
- */ -@@ -3128,6 +2913,10 @@ void *kmem_cache_alloc_node(struct kmem_ - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, - __builtin_return_address(0)); - -+ if (ptr && ub_slab_charge(ptr, flags)) { -+ kmem_cache_free(cachep, ptr); -+ ptr = NULL; -+ } - return ptr; - } - EXPORT_SYMBOL(kmem_cache_alloc_node); -@@ -3543,6 +3332,7 @@ static void cache_reap(void *unused) - return; - } - -+ {KSTAT_PERF_ENTER(cache_reap) - list_for_each(walk, &cache_chain) { - struct kmem_cache *searchp; - struct list_head *p; -@@ -3608,6 +3398,7 @@ static void cache_reap(void *unused) - check_irq_on(); - mutex_unlock(&cache_chain_mutex); - next_reap_node(); -+ KSTAT_PERF_LEAVE(cache_reap)} - /* Setup the next iteration */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); - } -diff -upr linux-2.6.16.orig/mm/swap_state.c linux-2.6.16-026test009/mm/swap_state.c ---- linux-2.6.16.orig/mm/swap_state.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/swap_state.c 2006-04-19 15:02:12.000000000 +0400 -@@ -18,6 +18,8 @@ - - #include <asm/pgtable.h> - -+#include <ub/ub_vmpages.h> -+ - /* - * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_list, to make sync_page look nicer, and to allow -@@ -53,6 +55,7 @@ static struct { - unsigned long noent_race; - unsigned long exist_race; - } swap_cache_info; -+EXPORT_SYMBOL(swap_cache_info); - - void show_swap_cache_info(void) - { -@@ -151,7 +154,14 @@ int add_to_swap(struct page * page, gfp_ - BUG(); - - for (;;) { -- entry = get_swap_page(); -+ struct user_beancounter *ub; -+ -+ ub = pb_grab_page_ub(page); -+ if (IS_ERR(ub)) -+ return 0; -+ -+ entry = get_swap_page(ub); -+ put_beancounter(ub); - if (!entry.val) - return 0; - -diff -upr linux-2.6.16.orig/mm/swapfile.c linux-2.6.16-026test009/mm/swapfile.c ---- linux-2.6.16.orig/mm/swapfile.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/swapfile.c 2006-04-19 15:02:12.000000000 +0400 -@@ -33,6 +33,8 @@ - 
#include <asm/tlbflush.h> - #include <linux/swapops.h> - -+#include <ub/ub_vmpages.h> -+ - DEFINE_SPINLOCK(swap_lock); - unsigned int nr_swapfiles; - long total_swap_pages; -@@ -172,7 +174,7 @@ no_page: - return 0; - } - --swp_entry_t get_swap_page(void) -+swp_entry_t get_swap_page(struct user_beancounter *ub) - { - struct swap_info_struct *si; - pgoff_t offset; -@@ -202,6 +204,7 @@ swp_entry_t get_swap_page(void) - offset = scan_swap_map(si); - if (offset) { - spin_unlock(&swap_lock); -+ ub_swapentry_inc(si, offset, ub); - return swp_entry(type, offset); - } - next = swap_list.next; -@@ -277,6 +280,7 @@ static int swap_entry_free(struct swap_i - count--; - p->swap_map[offset] = count; - if (!count) { -+ ub_swapentry_dec(p, offset); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) -@@ -423,11 +427,18 @@ void free_swap_and_cache(swp_entry_t ent - * force COW, vm_page_prot omits write permission from any private vma. - */ - static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, -- unsigned long addr, swp_entry_t entry, struct page *page) -+ unsigned long addr, swp_entry_t entry, struct page *page, -+ struct page_beancounter **pb) - { -- inc_mm_counter(vma->vm_mm, anon_rss); -+ struct mm_struct *mm; -+ -+ mm = vma->vm_mm; -+ inc_mm_counter(mm, anon_rss); -+ inc_vma_rss(vma); -+ ub_unused_privvm_dec(mm, vma); -+ pb_add_ref(page, mm, pb); - get_page(page); -- set_pte_at(vma->vm_mm, addr, pte, -+ set_pte_at(mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); - page_add_anon_rmap(page, vma, addr); - swap_free(entry); -@@ -440,7 +451,8 @@ static void unuse_pte(struct vm_area_str - - static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, -- swp_entry_t entry, struct page *page) -+ swp_entry_t entry, struct page *page, -+ struct page_beancounter **pb) - { - pte_t swp_pte = swp_entry_to_pte(entry); - pte_t *pte; -@@ -454,7 +466,7 @@ static int unuse_pte_range(struct 
vm_are - * Test inline before going to call unuse_pte. - */ - if (unlikely(pte_same(*pte, swp_pte))) { -- unuse_pte(vma, pte++, addr, entry, page); -+ unuse_pte(vma, pte++, addr, entry, page, pb); - found = 1; - break; - } -@@ -465,7 +477,8 @@ static int unuse_pte_range(struct vm_are - - static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, -- swp_entry_t entry, struct page *page) -+ swp_entry_t entry, struct page *page, -+ struct page_beancounter **pb) - { - pmd_t *pmd; - unsigned long next; -@@ -475,7 +488,7 @@ static inline int unuse_pmd_range(struct - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) - continue; -- if (unuse_pte_range(vma, pmd, addr, next, entry, page)) -+ if (unuse_pte_range(vma, pmd, addr, next, entry, page, pb)) - return 1; - } while (pmd++, addr = next, addr != end); - return 0; -@@ -483,7 +496,8 @@ static inline int unuse_pmd_range(struct - - static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, -- swp_entry_t entry, struct page *page) -+ swp_entry_t entry, struct page *page, -+ struct page_beancounter **pb) - { - pud_t *pud; - unsigned long next; -@@ -493,14 +507,15 @@ static inline int unuse_pud_range(struct - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; -- if (unuse_pmd_range(vma, pud, addr, next, entry, page)) -+ if (unuse_pmd_range(vma, pud, addr, next, entry, page, pb)) - return 1; - } while (pud++, addr = next, addr != end); - return 0; - } - - static int unuse_vma(struct vm_area_struct *vma, -- swp_entry_t entry, struct page *page) -+ swp_entry_t entry, struct page *page, -+ struct page_beancounter **pb) - { - pgd_t *pgd; - unsigned long addr, end, next; -@@ -521,14 +536,15 @@ static int unuse_vma(struct vm_area_stru - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; -- if (unuse_pud_range(vma, pgd, addr, next, entry, page)) -+ if 
(unuse_pud_range(vma, pgd, addr, next, entry, page, pb)) - return 1; - } while (pgd++, addr = next, addr != end); - return 0; - } - - static int unuse_mm(struct mm_struct *mm, -- swp_entry_t entry, struct page *page) -+ swp_entry_t entry, struct page *page, -+ struct page_beancounter **pb) - { - struct vm_area_struct *vma; - -@@ -543,7 +559,7 @@ static int unuse_mm(struct mm_struct *mm - lock_page(page); - } - for (vma = mm->mmap; vma; vma = vma->vm_next) { -- if (vma->anon_vma && unuse_vma(vma, entry, page)) -+ if (vma->anon_vma && unuse_vma(vma, entry, page, pb)) - break; - } - up_read(&mm->mmap_sem); -@@ -555,11 +571,12 @@ static int unuse_mm(struct mm_struct *mm - } - - #ifdef CONFIG_MIGRATION --int remove_vma_swap(struct vm_area_struct *vma, struct page *page) -+int remove_vma_swap(struct vm_area_struct *vma, struct page *page, -+ struct page_beancounter **pb) - { - swp_entry_t entry = { .val = page_private(page) }; - -- return unuse_vma(vma, entry, page); -+ return unuse_vma(vma, entry, page, pb); - } - #endif - -@@ -618,6 +635,7 @@ static int try_to_unuse(unsigned int typ - int retval = 0; - int reset_overflow = 0; - int shmem; -+ struct page_beancounter *pb; - - /* - * When searching mms for an entry, a good strategy is to -@@ -670,6 +688,13 @@ again: - break; - } - -+ pb = NULL; -+ if (pb_alloc_all(&pb)) { -+ page_cache_release(page); -+ retval = -ENOMEM; -+ break; -+ } -+ - /* - * Don't hold on to start_mm if it looks like exiting. - */ -@@ -698,6 +723,20 @@ again: - } - wait_on_page_writeback(page); - -+ /* If read failed we cannot map not-uptodate page to -+ * user space. Actually, we are in serious troubles, -+ * we do not even know what process to kill. So, the only -+ * variant remains: to stop swapoff() and allow someone -+ * to kill processes to zap invalid pages. 
-+ */ -+ if (unlikely(!PageUptodate(page))) { -+ pb_free_list(&pb); -+ unlock_page(page); -+ page_cache_release(page); -+ retval = -EIO; -+ break; -+ } -+ - /* - * Remove all references to entry. - * Whenever we reach init_mm, there's no address space -@@ -709,7 +748,7 @@ again: - if (start_mm == &init_mm) - shmem = shmem_unuse(entry, page); - else -- retval = unuse_mm(start_mm, entry, page); -+ retval = unuse_mm(start_mm, entry, page, &pb); - } - if (*swap_map > 1) { - int set_start_mm = (*swap_map >= swcount); -@@ -741,7 +780,7 @@ again: - set_start_mm = 1; - shmem = shmem_unuse(entry, page); - } else -- retval = unuse_mm(mm, entry, page); -+ retval = unuse_mm(mm, entry, page, &pb); - if (set_start_mm && *swap_map < swcount) { - mmput(new_start_mm); - atomic_inc(&mm->mm_users); -@@ -755,6 +794,8 @@ again: - mmput(start_mm); - start_mm = new_start_mm; - } -+ -+ pb_free_list(&pb); - if (retval) { - unlock_page(page); - page_cache_release(page); -@@ -1100,6 +1141,10 @@ asmlinkage long sys_swapoff(const char _ - int i, type, prev; - int err; - -+ /* VE admin check is just to be on the safe side, the admin may affect -+ * swaps only if he has access to special, i.e. if he has been granted -+ * access to the block device or if the swap file is in the area -+ * visible to him. 
*/ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - -@@ -1199,6 +1244,7 @@ asmlinkage long sys_swapoff(const char _ - spin_unlock(&swap_lock); - mutex_unlock(&swapon_mutex); - vfree(swap_map); -+ ub_swap_fini(p); - inode = mapping->host; - if (S_ISBLK(inode->i_mode)) { - struct block_device *bdev = I_BDEV(inode); -@@ -1557,6 +1603,11 @@ asmlinkage long sys_swapon(const char __ - goto bad_swap; - } - -+ if (ub_swap_init(p, maxpages)) { -+ error = -ENOMEM; -+ goto bad_swap; -+ } -+ - mutex_lock(&swapon_mutex); - spin_lock(&swap_lock); - p->flags = SWP_ACTIVE; -diff -upr linux-2.6.16.orig/mm/vmalloc.c linux-2.6.16-026test009/mm/vmalloc.c ---- linux-2.6.16.orig/mm/vmalloc.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/vmalloc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -20,6 +20,8 @@ - #include <asm/uaccess.h> - #include <asm/tlbflush.h> - -+#include <ub/ub_debug.h> -+ - - DEFINE_RWLOCK(vmlist_lock); - struct vm_struct *vmlist; -@@ -256,6 +258,68 @@ struct vm_struct *get_vm_area_node(unsig - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); - } - -+struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags) -+{ -+ unsigned long addr, best_addr, delta, best_delta; -+ struct vm_struct **p, **best_p, *tmp, *area; -+ -+ area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); -+ if (!area) -+ return NULL; -+ -+ size += PAGE_SIZE; /* one-page gap at the end */ -+ addr = VMALLOC_START; -+ best_addr = 0UL; -+ best_p = NULL; -+ best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START; -+ -+ write_lock(&vmlist_lock); -+ for (p = &vmlist; (tmp = *p) && -+ (tmp->addr <= (void *)PAGE_ALIGN(VMALLOC_END)); -+ p = &tmp->next) { -+ if ((size + addr) < addr) -+ break; -+ delta = (unsigned long) tmp->addr - (size + addr); -+ if (delta < best_delta) { -+ best_delta = delta; -+ best_addr = addr; -+ best_p = p; -+ } -+ addr = tmp->size + (unsigned long) tmp->addr; -+ if (addr > VMALLOC_END-size) -+ break; -+ } -+ -+ if (!tmp 
|| (tmp->addr > (void *)PAGE_ALIGN(VMALLOC_END))) { -+ /* check free area after list end */ -+ delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr); -+ if (delta < best_delta) { -+ best_delta = delta; -+ best_addr = addr; -+ best_p = p; -+ } -+ } -+ if (best_addr) { -+ area->flags = flags; -+ /* allocate at the end of this area */ -+ area->addr = (void *)(best_addr + best_delta); -+ area->size = size; -+ area->next = *best_p; -+ area->pages = NULL; -+ area->nr_pages = 0; -+ area->phys_addr = 0; -+ *best_p = area; -+ /* check like in __vunmap */ -+ WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr); -+ } else { -+ kfree(area); -+ area = NULL; -+ } -+ write_unlock(&vmlist_lock); -+ -+ return area; -+} -+ - /* Caller must hold vmlist_lock */ - struct vm_struct *__remove_vm_area(void *addr) - { -@@ -296,7 +360,7 @@ struct vm_struct *remove_vm_area(void *a - return v; - } - --void __vunmap(void *addr, int deallocate_pages) -+void __vunmap(void *addr, int deallocate_pages, int uncharge) - { - struct vm_struct *area; - -@@ -320,6 +384,8 @@ void __vunmap(void *addr, int deallocate - if (deallocate_pages) { - int i; - -+ if (uncharge) -+ dec_vmalloc_charged(area); - for (i = 0; i < area->nr_pages; i++) { - if (unlikely(!area->pages[i])) - BUG(); -@@ -350,7 +416,7 @@ void __vunmap(void *addr, int deallocate - void vfree(void *addr) - { - BUG_ON(in_interrupt()); -- __vunmap(addr, 1); -+ __vunmap(addr, 1, 1); - } - EXPORT_SYMBOL(vfree); - -@@ -367,7 +433,7 @@ EXPORT_SYMBOL(vfree); - void vunmap(void *addr) - { - BUG_ON(in_interrupt()); -- __vunmap(addr, 0); -+ __vunmap(addr, 0, 0); - } - EXPORT_SYMBOL(vunmap); - -@@ -439,10 +505,12 @@ void *__vmalloc_area_node(struct vm_stru - - if (map_vm_area(area, prot, &pages)) - goto fail; -+ -+ inc_vmalloc_charged(area, gfp_mask); - return area->addr; - - fail: -- vfree(area->addr); -+ __vunmap(area->addr, 1, 0); - return NULL; - } - -@@ -486,6 +554,21 @@ void *__vmalloc(unsigned long size, gfp_ - } - 
EXPORT_SYMBOL(__vmalloc); - -+static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot) -+{ -+ struct vm_struct *area; -+ -+ size = PAGE_ALIGN(size); -+ if (!size || (size >> PAGE_SHIFT) > num_physpages) -+ return NULL; -+ -+ area = get_vm_area_best(size, VM_ALLOC); -+ if (!area) -+ return NULL; -+ -+ return __vmalloc_area_node(area, mask, prot, -1); -+} -+ - /** - * vmalloc - allocate virtually contiguous memory - * -@@ -503,6 +586,20 @@ void *vmalloc(unsigned long size) - } - EXPORT_SYMBOL(vmalloc); - -+void *vmalloc_best(unsigned long size) -+{ -+ return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); -+} -+ -+EXPORT_SYMBOL(vmalloc_best); -+ -+void *ub_vmalloc_best(unsigned long size) -+{ -+ return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); -+} -+ -+EXPORT_SYMBOL(ub_vmalloc_best); -+ - /** - * vmalloc_node - allocate memory on a specific node - * -@@ -631,3 +728,37 @@ finished: - read_unlock(&vmlist_lock); - return buf - buf_start; - } -+ -+void vprintstat(void) -+{ -+ struct vm_struct *p, *last_p = NULL; -+ unsigned long addr, size, free_size, max_free_size; -+ int num; -+ -+ addr = VMALLOC_START; -+ size = max_free_size = 0; -+ num = 0; -+ -+ read_lock(&vmlist_lock); -+ for (p = vmlist; p; p = p->next) { -+ free_size = (unsigned long)p->addr - addr; -+ if (free_size > max_free_size) -+ max_free_size = free_size; -+ addr = (unsigned long)p->addr + p->size; -+ size += p->size; -+ ++num; -+ last_p = p; -+ } -+ if (last_p) { -+ free_size = VMALLOC_END - -+ ((unsigned long)last_p->addr + last_p->size); -+ if (free_size > max_free_size) -+ max_free_size = free_size; -+ } -+ read_unlock(&vmlist_lock); -+ -+ printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n" -+ " Max_Free: %luKB Start: %lx End: %lx\n", -+ size/1024, (VMALLOC_END - VMALLOC_START)/1024, num, -+ max_free_size/1024, VMALLOC_START, VMALLOC_END); -+} -diff -upr linux-2.6.16.orig/mm/vmscan.c linux-2.6.16-026test009/mm/vmscan.c ---- 
linux-2.6.16.orig/mm/vmscan.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/mm/vmscan.c 2006-04-19 15:02:12.000000000 +0400 -@@ -1243,6 +1243,7 @@ refill_inactive_zone(struct zone *zone, - reclaim_mapped = 1; - } - -+ KSTAT_PERF_ENTER(refill_inact) - lru_add_drain(); - spin_lock_irq(&zone->lru_lock); - pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, -@@ -1322,6 +1323,7 @@ refill_inactive_zone(struct zone *zone, - local_irq_enable(); - - pagevec_release(&pvec); -+ KSTAT_PERF_LEAVE(refill_inact); - } - - /* -@@ -1438,6 +1440,7 @@ int try_to_free_pages(struct zone **zone - unsigned long lru_pages = 0; - int i; - -+ KSTAT_PERF_ENTER(ttfp); - sc.gfp_mask = gfp_mask; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; -@@ -1500,6 +1503,7 @@ out: - - zone->prev_priority = zone->temp_priority; - } -+ KSTAT_PERF_LEAVE(ttfp); - return ret; - } - -@@ -1832,7 +1836,8 @@ static int __init kswapd_init(void) - swap_setup(); - for_each_pgdat(pgdat) - pgdat->kswapd -- = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); -+ = find_task_by_pid_all(kernel_thread(kswapd, -+ pgdat, CLONE_KERNEL)); - total_memory = nr_free_pagecache_pages(); - hotcpu_notifier(cpu_callback, 0); - return 0; -diff -upr linux-2.6.16.orig/net/atm/clip.c linux-2.6.16-026test009/net/atm/clip.c ---- linux-2.6.16.orig/net/atm/clip.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/atm/clip.c 2006-04-19 15:02:11.000000000 +0400 -@@ -613,12 +613,19 @@ static int clip_create(int number) - - - static int clip_device_event(struct notifier_block *this,unsigned long event, -- void *dev) -+ void *arg) - { -+ struct net_device *dev = arg; -+ -+ if (event == NETDEV_UNREGISTER) { -+ neigh_ifdown(&clip_tbl, dev); -+ return NOTIFY_DONE; -+ } -+ - /* ignore non-CLIP devices */ -- if (((struct net_device *) dev)->type != ARPHRD_ATM || -- ((struct net_device *) dev)->hard_start_xmit != clip_start_xmit) -+ if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != 
clip_start_xmit) - return NOTIFY_DONE; -+ - switch (event) { - case NETDEV_UP: - DPRINTK("clip_device_event NETDEV_UP\n"); -@@ -686,14 +693,12 @@ static struct notifier_block clip_inet_n - static void atmarpd_close(struct atm_vcc *vcc) - { - DPRINTK("atmarpd_close\n"); -- atmarpd = NULL; /* assumed to be atomic */ -- barrier(); -- unregister_inetaddr_notifier(&clip_inet_notifier); -- unregister_netdevice_notifier(&clip_dev_notifier); -- if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) -- printk(KERN_ERR "atmarpd_close: closing with requests " -- "pending\n"); -+ -+ rtnl_lock(); -+ atmarpd = NULL; - skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); -+ rtnl_unlock(); -+ - DPRINTK("(done)\n"); - module_put(THIS_MODULE); - } -@@ -714,7 +719,12 @@ static struct atm_dev atmarpd_dev = { - - static int atm_init_atmarp(struct atm_vcc *vcc) - { -- if (atmarpd) return -EADDRINUSE; -+ rtnl_lock(); -+ if (atmarpd) { -+ rtnl_unlock(); -+ return -EADDRINUSE; -+ } -+ - if (start_timer) { - start_timer = 0; - init_timer(&idle_timer); -@@ -731,10 +741,7 @@ static int atm_init_atmarp(struct atm_vc - vcc->push = NULL; - vcc->pop = NULL; /* crash */ - vcc->push_oam = NULL; /* crash */ -- if (register_netdevice_notifier(&clip_dev_notifier)) -- printk(KERN_ERR "register_netdevice_notifier failed\n"); -- if (register_inetaddr_notifier(&clip_inet_notifier)) -- printk(KERN_ERR "register_inetaddr_notifier failed\n"); -+ rtnl_unlock(); - return 0; - } - -@@ -992,6 +999,8 @@ static int __init atm_clip_init(void) - - clip_tbl_hook = &clip_tbl; - register_atm_ioctl(&clip_ioctl_ops); -+ register_netdevice_notifier(&clip_dev_notifier); -+ register_inetaddr_notifier(&clip_inet_notifier); - - #ifdef CONFIG_PROC_FS - { -@@ -1012,6 +1021,9 @@ static void __exit atm_clip_exit(void) - - remove_proc_entry("arp", atm_proc_root); - -+ unregister_inetaddr_notifier(&clip_inet_notifier); -+ unregister_netdevice_notifier(&clip_dev_notifier); -+ - deregister_atm_ioctl(&clip_ioctl_ops); - - /* First, stop the idle 
timer, so it stops banging -diff -upr linux-2.6.16.orig/net/bridge/br_netfilter.c linux-2.6.16-026test009/net/bridge/br_netfilter.c ---- linux-2.6.16.orig/net/bridge/br_netfilter.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/bridge/br_netfilter.c 2006-04-19 15:02:11.000000000 +0400 -@@ -739,6 +739,15 @@ out: - return NF_STOLEN; - } - -+static int br_nf_dev_queue_xmit(struct sk_buff *skb) -+{ -+ if (skb->protocol == htons(ETH_P_IP) && -+ skb->len > skb->dev->mtu && -+ !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) -+ return ip_fragment(skb, br_dev_queue_push_xmit); -+ else -+ return br_dev_queue_push_xmit(skb); -+} - - /* PF_BRIDGE/POST_ROUTING ********************************************/ - static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, -@@ -798,7 +807,7 @@ static unsigned int br_nf_post_routing(u - realoutdev = nf_bridge->netoutdev; - #endif - NF_HOOK(pf, NF_IP_POST_ROUTING, skb, NULL, realoutdev, -- br_dev_queue_push_xmit); -+ br_nf_dev_queue_xmit); - - return NF_STOLEN; - -@@ -843,7 +852,7 @@ static unsigned int ip_sabotage_out(unsi - if ((out->hard_start_xmit == br_dev_xmit && - okfn != br_nf_forward_finish && - okfn != br_nf_local_out_finish && -- okfn != br_dev_queue_push_xmit) -+ okfn != br_nf_dev_queue_xmit) - #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) - || ((out->priv_flags & IFF_802_1Q_VLAN) && - VLAN_DEV_INFO(out)->real_dev->hard_start_xmit == br_dev_xmit) -diff -upr linux-2.6.16.orig/net/compat.c linux-2.6.16-026test009/net/compat.c ---- linux-2.6.16.orig/net/compat.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/compat.c 2006-04-19 15:02:11.000000000 +0400 -@@ -308,107 +308,6 @@ void scm_detach_fds_compat(struct msghdr - } - - /* -- * For now, we assume that the compatibility and native version -- * of struct ipt_entry are the same - sfr. 
FIXME -- */ --struct compat_ipt_replace { -- char name[IPT_TABLE_MAXNAMELEN]; -- u32 valid_hooks; -- u32 num_entries; -- u32 size; -- u32 hook_entry[NF_IP_NUMHOOKS]; -- u32 underflow[NF_IP_NUMHOOKS]; -- u32 num_counters; -- compat_uptr_t counters; /* struct ipt_counters * */ -- struct ipt_entry entries[0]; --}; -- --static int do_netfilter_replace(int fd, int level, int optname, -- char __user *optval, int optlen) --{ -- struct compat_ipt_replace __user *urepl; -- struct ipt_replace __user *repl_nat; -- char name[IPT_TABLE_MAXNAMELEN]; -- u32 origsize, tmp32, num_counters; -- unsigned int repl_nat_size; -- int ret; -- int i; -- compat_uptr_t ucntrs; -- -- urepl = (struct compat_ipt_replace __user *)optval; -- if (get_user(origsize, &urepl->size)) -- return -EFAULT; -- -- /* Hack: Causes ipchains to give correct error msg --RR */ -- if (optlen != sizeof(*urepl) + origsize) -- return -ENOPROTOOPT; -- -- /* XXX Assumes that size of ipt_entry is the same both in -- * native and compat environments. 
-- */ -- repl_nat_size = sizeof(*repl_nat) + origsize; -- repl_nat = compat_alloc_user_space(repl_nat_size); -- -- ret = -EFAULT; -- if (put_user(origsize, &repl_nat->size)) -- goto out; -- -- if (!access_ok(VERIFY_READ, urepl, optlen) || -- !access_ok(VERIFY_WRITE, repl_nat, optlen)) -- goto out; -- -- if (__copy_from_user(name, urepl->name, sizeof(urepl->name)) || -- __copy_to_user(repl_nat->name, name, sizeof(repl_nat->name))) -- goto out; -- -- if (__get_user(tmp32, &urepl->valid_hooks) || -- __put_user(tmp32, &repl_nat->valid_hooks)) -- goto out; -- -- if (__get_user(tmp32, &urepl->num_entries) || -- __put_user(tmp32, &repl_nat->num_entries)) -- goto out; -- -- if (__get_user(num_counters, &urepl->num_counters) || -- __put_user(num_counters, &repl_nat->num_counters)) -- goto out; -- -- if (__get_user(ucntrs, &urepl->counters) || -- __put_user(compat_ptr(ucntrs), &repl_nat->counters)) -- goto out; -- -- if (__copy_in_user(&repl_nat->entries[0], -- &urepl->entries[0], -- origsize)) -- goto out; -- -- for (i = 0; i < NF_IP_NUMHOOKS; i++) { -- if (__get_user(tmp32, &urepl->hook_entry[i]) || -- __put_user(tmp32, &repl_nat->hook_entry[i]) || -- __get_user(tmp32, &urepl->underflow[i]) || -- __put_user(tmp32, &repl_nat->underflow[i])) -- goto out; -- } -- -- /* -- * Since struct ipt_counters just contains two u_int64_t members -- * we can just do the access_ok check here and pass the (converted) -- * pointer into the standard syscall. We hope that the pointer is -- * not misaligned ... -- */ -- if (!access_ok(VERIFY_WRITE, compat_ptr(ucntrs), -- num_counters * sizeof(struct ipt_counters))) -- goto out; -- -- -- ret = sys_setsockopt(fd, level, optname, -- (char __user *)repl_nat, repl_nat_size); -- --out: -- return ret; --} -- --/* - * A struct sock_filter is architecture independent. 
- */ - struct compat_sock_fprog { -@@ -460,10 +359,6 @@ static int do_set_sock_timeout(int fd, i - asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, int optlen) - { -- /* SO_SET_REPLACE seems to be the same in all levels */ -- if (optname == IPT_SO_SET_REPLACE) -- return do_netfilter_replace(fd, level, optname, -- optval, optlen); - if (level == SOL_SOCKET && optname == SO_ATTACH_FILTER) - return do_set_attach_filter(fd, level, optname, - optval, optlen); -diff -upr linux-2.6.16.orig/net/core/datagram.c linux-2.6.16-026test009/net/core/datagram.c ---- linux-2.6.16.orig/net/core/datagram.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/datagram.c 2006-04-19 15:02:12.000000000 +0400 -@@ -56,6 +56,8 @@ - #include <net/sock.h> - #include <net/tcp_states.h> - -+#include <ub/ub_net.h> -+ - /* - * Is a socket 'connection oriented' ? - */ -@@ -493,6 +495,7 @@ unsigned int datagram_poll(struct file * - { - struct sock *sk = sock->sk; - unsigned int mask; -+ int no_ubc_space; - - poll_wait(file, sk->sk_sleep, wait); - mask = 0; -@@ -500,8 +503,14 @@ unsigned int datagram_poll(struct file * - /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; -- if (sk->sk_shutdown == SHUTDOWN_MASK) -+ if (sk->sk_shutdown == SHUTDOWN_MASK) { -+ no_ubc_space = 0; - mask |= POLLHUP; -+ } else { -+ no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); -+ if (no_ubc_space) -+ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); -+ } - - /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || -@@ -518,7 +527,7 @@ unsigned int datagram_poll(struct file * - } - - /* writable? 
*/ -- if (sock_writeable(sk)) -+ if (!no_ubc_space && sock_writeable(sk)) - mask |= POLLOUT | POLLWRNORM | POLLWRBAND; - else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); -diff -upr linux-2.6.16.orig/net/core/dev.c linux-2.6.16-026test009/net/core/dev.c ---- linux-2.6.16.orig/net/core/dev.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/dev.c 2006-04-19 15:02:12.000000000 +0400 -@@ -115,6 +115,10 @@ - #include <net/iw_handler.h> - #endif /* CONFIG_NET_RADIO */ - #include <asm/current.h> -+#include <ub/beancounter.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_mem.h> - - /* - * The list of packet types we will receive (as opposed to discard) -@@ -167,25 +171,40 @@ static struct list_head ptype_all; /* T - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -+#ifdef CONFIG_VE -+#define dev_tail (get_exec_env()->_net_dev_tail) -+#else - struct net_device *dev_base; - static struct net_device **dev_tail = &dev_base; -+EXPORT_SYMBOL(dev_base); -+#endif - DEFINE_RWLOCK(dev_base_lock); - --EXPORT_SYMBOL(dev_base); - EXPORT_SYMBOL(dev_base_lock); - -+#ifdef CONFIG_VE -+#define MAX_UNMOVABLE_NETDEVICES (8*4096) -+static uint8_t unmovable_ifindex_list[MAX_UNMOVABLE_NETDEVICES/8]; -+static LIST_HEAD(dev_global_list); -+#endif -+ - #define NETDEV_HASHBITS 8 - static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; - static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; - --static inline struct hlist_head *dev_name_hash(const char *name) -+struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env) - { -- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); -+ unsigned hash; -+ if (!ve_is_super(env)) -+ return visible_dev_head(env); -+ hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; - } - --static inline struct hlist_head *dev_index_hash(int ifindex) -+struct hlist_head *dev_index_hash(int ifindex, struct 
ve_struct *env) - { -+ if (!ve_is_super(env)) -+ return visible_dev_index_head(env); - return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; - } - -@@ -469,7 +488,7 @@ struct net_device *__dev_get_by_name(con - { - struct hlist_node *p; - -- hlist_for_each(p, dev_name_hash(name)) { -+ hlist_for_each(p, dev_name_hash(name, get_exec_env())) { - struct net_device *dev - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(dev->name, name, IFNAMSIZ)) -@@ -502,6 +521,32 @@ struct net_device *dev_get_by_name(const - } - - /** -+ * __dev_global_get_by_name - find a device by its name in dev_global_list -+ * @name: name to find -+ * -+ * Find an interface by name. Must be called under RTNL semaphore -+ * If the name is found a pointer to the device -+ * is returned. If the name is not found then %NULL is returned. The -+ * reference counters are not incremented so the caller must be -+ * careful with locks. -+ */ -+ -+#ifdef CONFIG_VE -+struct net_device *__dev_global_get_by_name(const char *name) -+{ -+ struct net_device *dev; -+ /* It's called relatively rarely */ -+ list_for_each_entry(dev, &dev_global_list, dev_global_list_entry) { -+ if (strncmp(dev->name, name, IFNAMSIZ) == 0) -+ return dev; -+ } -+ return NULL; -+} -+#else /* CONFIG_VE */ -+#define __dev_global_get_by_name(name) __dev_get_by_name(name) -+#endif /* CONFIG_VE */ -+ -+/** - * __dev_get_by_index - find a device by its ifindex - * @ifindex: index of device - * -@@ -516,7 +561,7 @@ struct net_device *__dev_get_by_index(in - { - struct hlist_node *p; - -- hlist_for_each(p, dev_index_hash(ifindex)) { -+ hlist_for_each(p, dev_index_hash(ifindex, get_exec_env())) { - struct net_device *dev - = hlist_entry(p, struct net_device, index_hlist); - if (dev->ifindex == ifindex) -@@ -635,6 +680,23 @@ int dev_valid_name(const char *name) - || strchr(name, '/')); - } - -+static inline void __dev_check_name(const char *dev_name, const char *name, -+ long *inuse, const int max_netdevices) -+{ -+ int i = 
0; -+ char buf[IFNAMSIZ]; -+ -+ if (!sscanf(dev_name, name, &i)) -+ return; -+ if (i < 0 || i >= max_netdevices) -+ return; -+ -+ /* avoid cases where sscanf is not exact inverse of printf */ -+ snprintf(buf, sizeof(buf), name, i); -+ if (!strncmp(buf, dev_name, IFNAMSIZ)) -+ set_bit(i, inuse); -+} -+ - /** - * dev_alloc_name - allocate a name for a device - * @dev: device -@@ -671,16 +733,20 @@ int dev_alloc_name(struct net_device *de - if (!inuse) - return -ENOMEM; - -- for (d = dev_base; d; d = d->next) { -- if (!sscanf(d->name, name, &i)) -- continue; -- if (i < 0 || i >= max_netdevices) -- continue; -- -- /* avoid cases where sscanf is not exact inverse of printf */ -- snprintf(buf, sizeof(buf), name, i); -- if (!strncmp(buf, d->name, IFNAMSIZ)) -- set_bit(i, inuse); -+#ifdef CONFIG_VE -+ if (ve_is_super(get_exec_env())) { -+ list_for_each_entry(d, &dev_global_list, -+ dev_global_list_entry) { -+ __dev_check_name(d->name, name, inuse, -+ max_netdevices); -+ } -+ } else -+#endif -+ { -+ for (d = dev_base; d; d = d->next) { -+ __dev_check_name(d->name, name, inuse, -+ max_netdevices); -+ } - } - - i = find_first_zero_bit(inuse, max_netdevices); -@@ -688,7 +754,11 @@ int dev_alloc_name(struct net_device *de - } - - snprintf(buf, sizeof(buf), name, i); -- if (!__dev_get_by_name(buf)) { -+ if (ve_is_super(get_exec_env())) -+ d = __dev_global_get_by_name(buf); -+ else -+ d = __dev_get_by_name(buf); -+ if (d == NULL) { - strlcpy(dev->name, buf, IFNAMSIZ); - return i; - } -@@ -721,13 +791,14 @@ int dev_change_name(struct net_device *d - if (!dev_valid_name(newname)) - return -EINVAL; - -+ /* Rename of devices in VE is prohibited by CAP_NET_ADMIN */ - if (strchr(newname, '%')) { - err = dev_alloc_name(dev, newname); - if (err < 0) - return err; - strcpy(newname, dev->name); - } -- else if (__dev_get_by_name(newname)) -+ else if (__dev_global_get_by_name(newname)) - return -EEXIST; - else - strlcpy(dev->name, newname, IFNAMSIZ); -@@ -735,7 +806,8 @@ int 
dev_change_name(struct net_device *d - err = class_device_rename(&dev->class_dev, dev->name); - if (!err) { - hlist_del(&dev->name_hlist); -- hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); -+ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, -+ get_exec_env())); - notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); - } - -@@ -1294,6 +1366,25 @@ int dev_queue_xmit(struct sk_buff *skb) - skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); - #endif - if (q->enqueue) { -+ struct user_beancounter *ub; -+ -+ ub = netdev_bc(dev)->exec_ub; -+ /* the skb CAN be already charged if it transmitted via -+ * something like bonding device */ -+ if (ub && (skb_bc(skb)->resource == 0)) { -+ unsigned long chargesize; -+ chargesize = skb_charge_fullsize(skb); -+ if (charge_beancounter(ub, UB_OTHERSOCKBUF, -+ chargesize, UB_SOFT)) { -+ rcu_read_unlock(); -+ rc = -ENOMEM; -+ goto out_kfree_skb; -+ } -+ skb_bc(skb)->ub = ub; -+ skb_bc(skb)->charged = chargesize; -+ skb_bc(skb)->resource = UB_OTHERSOCKBUF; -+ } -+ - /* Grab device queue */ - spin_lock(&dev->queue_lock); - -@@ -1580,6 +1671,7 @@ int netif_receive_skb(struct sk_buff *sk - struct net_device *orig_dev; - int ret = NET_RX_DROP; - unsigned short type; -+ struct ve_struct *old_env; - - /* if we've gotten here through NAPI, check netpoll */ - if (skb->dev->poll && netpoll_rx(skb)) -@@ -1598,6 +1690,17 @@ int netif_receive_skb(struct sk_buff *sk - skb->h.raw = skb->nh.raw = skb->data; - skb->mac_len = skb->nh.raw - skb->mac.raw; - -+#ifdef CONFIG_VE -+ /* -+ * Skb might be alloced in another VE context, than its device works. -+ * So, set the correct owner_env. 
-+ */ -+ skb->owner_env = skb->dev->owner_env; -+ BUG_ON(skb->owner_env == NULL); -+#endif -+ -+ old_env = set_exec_env(VE_OWNER_SKB(skb)); -+ - pt_prev = NULL; - - rcu_read_lock(); -@@ -1663,6 +1766,7 @@ ncls: - - out: - rcu_read_unlock(); -+ (void)set_exec_env(old_env); - return ret; - } - -@@ -2038,7 +2142,7 @@ static int __init dev_proc_init(void) - { - int rc = -ENOMEM; - -- if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) -+ if (!proc_glob_fops_create("net/dev", S_IRUGO, &dev_seq_fops)) - goto out; - if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) - goto out_dev; -@@ -2050,7 +2154,7 @@ out: - out_softnet: - proc_net_remove("softnet_stat"); - out_dev: -- proc_net_remove("dev"); -+ remove_proc_glob_entry("net/dev", NULL); - goto out; - } - #else -@@ -2115,6 +2219,9 @@ void dev_set_promiscuity(struct net_devi - dev->flags &= ~IFF_PROMISC; - else - dev->flags |= IFF_PROMISC; -+ /* Promiscous mode on these devices does not mean anything */ -+ if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) -+ return; - if (dev->flags != old_flags) { - dev_mc_upload(dev); - printk(KERN_INFO "device %s %s promiscuous mode\n", -@@ -2529,9 +2636,28 @@ int dev_ioctl(unsigned int cmd, void __u - * - require strict serialization. 
- * - do not return a value - */ -+ case SIOCSIFMTU: -+ if (!capable(CAP_NET_ADMIN) && -+ !capable(CAP_VE_NET_ADMIN)) -+ return -EPERM; -+ dev_load(ifr.ifr_name); -+ rtnl_lock(); -+ if (!ve_is_super(get_exec_env())) { -+ struct net_device *dev; -+ ret = -ENODEV; -+ if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL) -+ goto out_set_mtu_unlock; -+ ret = -EPERM; -+ if (ifr.ifr_mtu > dev->orig_mtu) -+ goto out_set_mtu_unlock; -+ } -+ ret = dev_ifsioc(&ifr, cmd); -+out_set_mtu_unlock: -+ rtnl_unlock(); -+ return ret; -+ - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: -- case SIOCSIFMTU: - case SIOCSIFMAP: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: -@@ -2613,20 +2739,73 @@ int dev_ioctl(unsigned int cmd, void __u - * dev_new_index - allocate an ifindex - * - * Returns a suitable unique value for a new device interface -- * number. The caller must hold the rtnl semaphore or the -+ * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. -+ * -+ * Note: dev->name must be valid on entrance - */ --static int dev_new_index(void) -+static int dev_ve_new_index(void) - { -- static int ifindex; -+#ifdef CONFIG_VE -+ int *ifindex = &get_exec_env()->ifindex; -+ int delta = 2; -+#else -+ static int s_ifindex; -+ int *ifindex = &s_ifindex; -+ int delta = 1; -+#endif - for (;;) { -- if (++ifindex <= 0) -- ifindex = 1; -- if (!__dev_get_by_index(ifindex)) -- return ifindex; -+ *ifindex += delta; -+ if (*ifindex <= 0) -+ *ifindex = 1; -+ if (!__dev_get_by_index(*ifindex)) -+ return *ifindex; - } - } - -+#ifdef CONFIG_VE -+static int dev_glb_new_index(void) -+{ -+ int i; -+ -+ i = find_first_zero_bit((long*)unmovable_ifindex_list, -+ MAX_UNMOVABLE_NETDEVICES); -+ -+ if (i == MAX_UNMOVABLE_NETDEVICES) -+ return -EMFILE; -+ -+ __set_bit(i, (long*)unmovable_ifindex_list); -+ return (i + 1) * 2; -+} -+#endif -+ -+static void dev_glb_free_index(struct net_device *dev) -+{ -+#ifdef CONFIG_VE -+ int bit; -+ -+ bit = dev->ifindex / 2 - 1; -+ 
BUG_ON(bit >= MAX_UNMOVABLE_NETDEVICES); -+ __clear_bit(bit, (long*)unmovable_ifindex_list); -+#endif -+} -+ -+static int dev_new_index(struct net_device *dev) -+{ -+#ifdef CONFIG_VE -+ if (ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) -+ return dev_glb_new_index(); -+#endif -+ -+ return dev_ve_new_index(); -+} -+ -+static void dev_free_index(struct net_device *dev) -+{ -+ if ((dev->ifindex % 2) == 0) -+ dev_glb_free_index(dev); -+} -+ - static int dev_boot_phase = 1; - - /* Delayed registration/unregisteration */ -@@ -2669,6 +2848,10 @@ int register_netdevice(struct net_device - /* When net_device's are persistent, this will be fatal. */ - BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); - -+ ret = -EPERM; -+ if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) -+ goto out; -+ - spin_lock_init(&dev->queue_lock); - spin_lock_init(&dev->xmit_lock); - dev->xmit_lock_owner = -1; -@@ -2688,27 +2871,32 @@ int register_netdevice(struct net_device - if (ret) { - if (ret > 0) - ret = -EIO; -- goto out_err; -+ goto out_free_div; - } - } - - if (!dev_valid_name(dev->name)) { - ret = -EINVAL; -- goto out_err; -+ goto out_free_div; -+ } -+ -+ dev->ifindex = dev_new_index(dev); -+ if (dev->ifindex < 0) { -+ ret = dev->ifindex; -+ goto out_free_div; - } - -- dev->ifindex = dev_new_index(); - if (dev->iflink == -1) - dev->iflink = dev->ifindex; - - /* Check for existence of name */ -- head = dev_name_hash(dev->name); -+ head = dev_name_hash(dev->name, get_exec_env()); - hlist_for_each(p, head) { - struct net_device *d - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(d->name, dev->name, IFNAMSIZ)) { - ret = -EEXIST; -- goto out_err; -+ goto out_free_ind; - } - } - -@@ -2760,12 +2948,21 @@ int register_netdevice(struct net_device - set_bit(__LINK_STATE_PRESENT, &dev->state); - - dev->next = NULL; -+ dev->owner_env = get_exec_env(); -+ dev->orig_mtu = dev->mtu; -+ netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub()); -+ netdev_bc(dev)->exec_ub = 
get_beancounter(get_exec_ub()); - dev_init_scheduler(dev); -+#ifdef CONFIG_VE -+ if (ve_is_super(get_exec_env())) -+ list_add_tail(&dev->dev_global_list_entry, &dev_global_list); -+#endif - write_lock_bh(&dev_base_lock); - *dev_tail = dev; - dev_tail = &dev->next; - hlist_add_head(&dev->name_hlist, head); -- hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); -+ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, -+ get_exec_env())); - dev_hold(dev); - dev->reg_state = NETREG_REGISTERING; - write_unlock_bh(&dev_base_lock); -@@ -2779,7 +2976,9 @@ int register_netdevice(struct net_device - - out: - return ret; --out_err: -+out_free_ind: -+ dev_free_index(dev); -+out_free_div: - free_divert_blk(dev); - goto out; - } -@@ -2825,6 +3024,10 @@ int register_netdev(struct net_device *d - err = register_netdevice(dev); - out: - rtnl_unlock(); -+ if (err == 0 && dev->reg_state != NETREG_REGISTERED) { -+ unregister_netdev(dev); -+ err = -ENOMEM; -+ } - return err; - } - EXPORT_SYMBOL(register_netdev); -@@ -2907,6 +3110,7 @@ void netdev_run_todo(void) - { - struct list_head list = LIST_HEAD_INIT(list); - int err; -+ struct ve_struct *current_env; - - - /* Need to guard against multiple cpu's getting out of order. 
*/ -@@ -2925,22 +3129,30 @@ void netdev_run_todo(void) - list_splice_init(&net_todo_list, &list); - spin_unlock(&net_todo_list_lock); - -+ current_env = get_exec_env(); - while (!list_empty(&list)) { - struct net_device *dev - = list_entry(list.next, struct net_device, todo_list); - list_del(&dev->todo_list); - -+ (void)set_exec_env(dev->owner_env); - switch(dev->reg_state) { - case NETREG_REGISTERING: - err = netdev_register_sysfs(dev); -- if (err) -+ if (err) { - printk(KERN_ERR "%s: failed sysfs registration (%d)\n", - dev->name, err); -+ dev->reg_state = NETREG_REGISTER_ERR; -+ break; -+ } - dev->reg_state = NETREG_REGISTERED; - break; - - case NETREG_UNREGISTERING: - netdev_unregister_sysfs(dev); -+ /* fall through */ -+ -+ case NETREG_REGISTER_ERR: - dev->reg_state = NETREG_UNREGISTERED; - - netdev_wait_allrefs(dev); -@@ -2951,6 +3163,10 @@ void netdev_run_todo(void) - BUG_TRAP(!dev->ip6_ptr); - BUG_TRAP(!dev->dn_ptr); - -+ put_beancounter(netdev_bc(dev)->exec_ub); -+ put_beancounter(netdev_bc(dev)->owner_ub); -+ netdev_bc(dev)->exec_ub = NULL; -+ netdev_bc(dev)->owner_ub = NULL; - - /* It must be the very last action, - * after this 'dev' may point to freed up memory. -@@ -2965,6 +3181,7 @@ void netdev_run_todo(void) - break; - } - } -+ (void)set_exec_env(current_env); - - out: - up(&net_todo_run_mutex); -@@ -2990,7 +3207,7 @@ struct net_device *alloc_netdev(int size - alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; - alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; - -- p = kmalloc(alloc_size, GFP_KERNEL); -+ p = ub_kmalloc(alloc_size, GFP_KERNEL); - if (!p) { - printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); - return NULL; -@@ -3070,7 +3287,8 @@ int unregister_netdevice(struct net_devi - return -ENODEV; - } - -- BUG_ON(dev->reg_state != NETREG_REGISTERED); -+ BUG_ON(dev->reg_state != NETREG_REGISTERED && -+ dev->reg_state != NETREG_REGISTER_ERR); - - /* If device is running, close it first. 
*/ - if (dev->flags & IFF_UP) -@@ -3086,6 +3304,10 @@ int unregister_netdevice(struct net_devi - dev_tail = dp; - *dp = d->next; - write_unlock_bh(&dev_base_lock); -+#ifdef CONFIG_VE -+ if (ve_is_super(get_exec_env())) -+ list_del(&dev->dev_global_list_entry); -+#endif - break; - } - } -@@ -3095,7 +3317,8 @@ int unregister_netdevice(struct net_devi - return -ENODEV; - } - -- dev->reg_state = NETREG_UNREGISTERING; -+ if (dev->reg_state != NETREG_REGISTER_ERR) -+ dev->reg_state = NETREG_UNREGISTERING; - - synchronize_net(); - -@@ -3119,6 +3342,8 @@ int unregister_netdevice(struct net_devi - /* Notifier chain MUST detach us from master device. */ - BUG_TRAP(!dev->master); - -+ dev_free_index(dev); -+ - free_divert_blk(dev); - - /* Finish processing unregister after unlock */ -@@ -3276,6 +3501,8 @@ EXPORT_SYMBOL(dev_close); - EXPORT_SYMBOL(dev_get_by_flags); - EXPORT_SYMBOL(dev_get_by_index); - EXPORT_SYMBOL(dev_get_by_name); -+EXPORT_SYMBOL(dev_name_hash); -+EXPORT_SYMBOL(dev_index_hash); - EXPORT_SYMBOL(dev_open); - EXPORT_SYMBOL(dev_queue_xmit); - EXPORT_SYMBOL(dev_remove_pack); -diff -upr linux-2.6.16.orig/net/core/dev_mcast.c linux-2.6.16-026test009/net/core/dev_mcast.c ---- linux-2.6.16.orig/net/core/dev_mcast.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/dev_mcast.c 2006-04-19 15:02:12.000000000 +0400 -@@ -290,9 +290,10 @@ static struct file_operations dev_mc_seq - - void __init dev_mcast_init(void) - { -- proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops); -+ proc_glob_fops_create("net/dev_mcast", 0, &dev_mc_seq_fops); - } - - EXPORT_SYMBOL(dev_mc_add); - EXPORT_SYMBOL(dev_mc_delete); - EXPORT_SYMBOL(dev_mc_upload); -+EXPORT_SYMBOL(dev_mc_discard); -diff -upr linux-2.6.16.orig/net/core/dst.c linux-2.6.16-026test009/net/core/dst.c ---- linux-2.6.16.orig/net/core/dst.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/dst.c 2006-04-19 15:02:12.000000000 +0400 -@@ -260,11 +260,14 @@ static int 
dst_dev_event(struct notifier - switch (event) { - case NETDEV_UNREGISTER: - case NETDEV_DOWN: -- spin_lock_bh(&dst_lock); -+ local_bh_disable(); -+ dst_run_gc(0); -+ spin_lock(&dst_lock); - for (dst = dst_garbage_list; dst; dst = dst->next) { - dst_ifdown(dst, dev, event != NETDEV_DOWN); - } -- spin_unlock_bh(&dst_lock); -+ spin_unlock(&dst_lock); -+ local_bh_enable(); - break; - } - return NOTIFY_DONE; -diff -upr linux-2.6.16.orig/net/core/filter.c linux-2.6.16-026test009/net/core/filter.c ---- linux-2.6.16.orig/net/core/filter.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/filter.c 2006-04-19 15:02:11.000000000 +0400 -@@ -34,6 +34,7 @@ - #include <linux/timer.h> - #include <asm/system.h> - #include <asm/uaccess.h> -+#include <asm/unaligned.h> - #include <linux/filter.h> - - /* No hurry in this branch */ -@@ -177,7 +178,7 @@ unsigned int sk_run_filter(struct sk_buf - load_w: - ptr = load_pointer(skb, k, 4, &tmp); - if (ptr != NULL) { -- A = ntohl(*(u32 *)ptr); -+ A = ntohl(get_unaligned((u32 *)ptr)); - continue; - } - break; -@@ -186,7 +187,7 @@ load_w: - load_h: - ptr = load_pointer(skb, k, 2, &tmp); - if (ptr != NULL) { -- A = ntohs(*(u16 *)ptr); -+ A = ntohs(get_unaligned((u16 *)ptr)); - continue; - } - break; -@@ -406,7 +407,7 @@ int sk_attach_filter(struct sock_fprog * - if (fprog->filter == NULL) - return -EINVAL; - -- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); -+ fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC); - if (!fp) - return -ENOMEM; - if (copy_from_user(fp->insns, fprog->filter, fsize)) { -diff -upr linux-2.6.16.orig/net/core/neighbour.c linux-2.6.16-026test009/net/core/neighbour.c ---- linux-2.6.16.orig/net/core/neighbour.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/neighbour.c 2006-04-19 15:02:12.000000000 +0400 -@@ -727,6 +727,11 @@ static void neigh_timer_handler(unsigned - struct neighbour *neigh = (struct neighbour *)arg; - unsigned state; - int notify = 0; -+ 
struct ve_struct *env; -+ struct user_beancounter *ub; -+ -+ env = set_exec_env(neigh->dev->owner_env); -+ ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub); - - write_lock(&neigh->lock); - -@@ -824,6 +829,8 @@ out: - neigh_app_notify(neigh); - #endif - neigh_release(neigh); -+ (void)set_exec_ub(ub); -+ (void)set_exec_env(env); - } - - int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) -@@ -1213,6 +1220,12 @@ static void neigh_proxy_process(unsigned - skb = skb->next; - if (tdif <= 0) { - struct net_device *dev = back->dev; -+ struct ve_struct *env; -+ struct user_beancounter *ub; -+ -+ env = set_exec_env(dev->owner_env); -+ ub = set_exec_ub(netdev_bc(dev)->exec_ub); -+ - __skb_unlink(back, &tbl->proxy_queue); - if (tbl->proxy_redo && netif_running(dev)) - tbl->proxy_redo(back); -@@ -1220,6 +1233,9 @@ static void neigh_proxy_process(unsigned - kfree_skb(back); - - dev_put(dev); -+ -+ (void)set_exec_ub(ub); -+ (void)set_exec_env(env); - } else if (!sched_next || tdif < sched_next) - sched_next = tdif; - } -@@ -1424,6 +1440,9 @@ int neigh_delete(struct sk_buff *skb, st - struct net_device *dev = NULL; - int err = -ENODEV; - -+ if (!ve_is_super(get_exec_env())) -+ return -EACCES; -+ - if (ndm->ndm_ifindex && - (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) - goto out; -@@ -1475,6 +1494,9 @@ int neigh_add(struct sk_buff *skb, struc - struct net_device *dev = NULL; - int err = -ENODEV; - -+ if (!ve_is_super(get_exec_env())) -+ return -EACCES; -+ - if (ndm->ndm_ifindex && - (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) - goto out; -@@ -1936,6 +1958,9 @@ int neigh_dump_info(struct sk_buff *skb, - struct neigh_table *tbl; - int t, family, s_t; - -+ if (!ve_is_super(get_exec_env())) -+ return -EACCES; -+ - read_lock(&neigh_tbl_lock); - family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family; - s_t = cb->args[0]; -@@ -2530,11 +2555,17 @@ int neigh_sysctl_register(struct net_dev - int p_id, int pdev_id, char *p_name, - proc_handler 
*handler, ctl_handler *strategy) - { -- struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); -+ struct neigh_sysctl_table *t; - const char *dev_name_source = NULL; - char *dev_name = NULL; - int err = 0; - -+ /* This function is called from VExx only from devinet_init, -+ and it is does not matter what is returned */ -+ if (!ve_is_super(get_exec_env())) -+ return 0; -+ -+ t = kmalloc(sizeof(*t), GFP_KERNEL); - if (!t) - return -ENOBUFS; - memcpy(t, &neigh_sysctl_template, sizeof(*t)); -@@ -2625,6 +2656,8 @@ int neigh_sysctl_register(struct net_dev - - void neigh_sysctl_unregister(struct neigh_parms *p) - { -+ if (!ve_is_super(get_exec_env())) -+ return; - if (p->sysctl_table) { - struct neigh_sysctl_table *t = p->sysctl_table; - p->sysctl_table = NULL; -diff -upr linux-2.6.16.orig/net/core/net-sysfs.c linux-2.6.16-026test009/net/core/net-sysfs.c ---- linux-2.6.16.orig/net/core/net-sysfs.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/net-sysfs.c 2006-04-19 15:02:12.000000000 +0400 -@@ -388,12 +388,13 @@ static void netdev_release(struct class_ - struct net_device *dev - = container_of(cd, struct net_device, class_dev); - -- BUG_ON(dev->reg_state != NETREG_RELEASED); -+ BUG_ON(dev->reg_state != NETREG_RELEASED && -+ dev->reg_state != NETREG_REGISTERING); - - kfree((char *)dev - dev->padded); - } - --static struct class net_class = { -+struct class net_class = { - .name = "net", - .release = netdev_release, - .class_dev_attrs = net_class_attributes, -@@ -401,6 +402,13 @@ static struct class net_class = { - .uevent = netdev_uevent, - #endif - }; -+EXPORT_SYMBOL(net_class); -+ -+#ifndef CONFIG_VE -+#define visible_net_class net_class -+#else -+#define visible_net_class (*get_exec_env()->net_class) -+#endif - - void netdev_unregister_sysfs(struct net_device * net) - { -@@ -424,7 +432,7 @@ int netdev_register_sysfs(struct net_dev - struct class_device *class_dev = &(net->class_dev); - int ret; - -- class_dev->class = &net_class; -+ 
class_dev->class = &visible_net_class; - class_dev->class_data = net; - - strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE); -@@ -453,12 +461,21 @@ out_cleanup: - out_unreg: - printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n", - net->name, ret); -- class_device_unregister(class_dev); -+ /* put is called in free_netdev() */ -+ class_device_del(class_dev); - out: - return ret; - } - -+void prepare_sysfs_netdev(void) -+{ -+#ifdef CONFIG_VE -+ get_ve0()->net_class = &net_class; -+#endif -+} -+ - int netdev_sysfs_init(void) - { -+ prepare_sysfs_netdev(); - return class_register(&net_class); - } -diff -upr linux-2.6.16.orig/net/core/rtnetlink.c linux-2.6.16-026test009/net/core/rtnetlink.c ---- linux-2.6.16.orig/net/core/rtnetlink.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/rtnetlink.c 2006-04-19 15:02:12.000000000 +0400 -@@ -434,6 +434,8 @@ static int rtnetlink_dump_all(struct sk_ - if (rtnetlink_links[idx] == NULL || - rtnetlink_links[idx][type].dumpit == NULL) - continue; -+ if (vz_security_proto_check(idx, 0, 0)) -+ continue; - if (idx > s_idx) - memset(&cb->args[0], 0, sizeof(cb->args)); - if (rtnetlink_links[idx][type].dumpit(skb, cb)) -@@ -501,7 +503,7 @@ rtnetlink_rcv_msg(struct sk_buff *skb, s - return 0; - - family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; -- if (family >= NPROTO) { -+ if (family >= NPROTO || vz_security_proto_check(family, 0, 0)) { - *errp = -EAFNOSUPPORT; - return -1; - } -diff -upr linux-2.6.16.orig/net/core/scm.c linux-2.6.16-026test009/net/core/scm.c ---- linux-2.6.16.orig/net/core/scm.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/scm.c 2006-04-19 15:02:12.000000000 +0400 -@@ -34,6 +34,7 @@ - #include <net/compat.h> - #include <net/scm.h> - -+#include <ub/ub_mem.h> - - /* - * Only allow a user to send credentials, that they could set with -@@ -42,7 +43,9 @@ - - static __inline__ int scm_check_creds(struct ucred *creds) - { -- if ((creds->pid == 
current->tgid || capable(CAP_SYS_ADMIN)) && -+ if ((creds->pid == virt_tgid(current) || -+ creds->pid == current->tgid || -+ capable(CAP_VE_SYS_ADMIN)) && - ((creds->uid == current->uid || creds->uid == current->euid || - creds->uid == current->suid) || capable(CAP_SETUID)) && - ((creds->gid == current->gid || creds->gid == current->egid || -@@ -69,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *c - - if (!fpl) - { -- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); -+ fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - *fplp = fpl; -@@ -275,7 +278,7 @@ struct scm_fp_list *scm_fp_dup(struct sc - if (!fpl) - return NULL; - -- new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); -+ new_fpl = ub_kmalloc(sizeof(*fpl), GFP_KERNEL); - if (new_fpl) { - for (i=fpl->count-1; i>=0; i--) - get_file(fpl->fp[i]); -diff -upr linux-2.6.16.orig/net/core/skbuff.c linux-2.6.16-026test009/net/core/skbuff.c ---- linux-2.6.16.orig/net/core/skbuff.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/skbuff.c 2006-04-19 15:02:12.000000000 +0400 -@@ -48,6 +48,7 @@ - #include <linux/in.h> - #include <linux/inet.h> - #include <linux/slab.h> -+#include <linux/kmem_cache.h> - #include <linux/netdevice.h> - #ifdef CONFIG_NET_CLS_ACT - #include <net/pkt_sched.h> -@@ -68,6 +69,8 @@ - #include <asm/uaccess.h> - #include <asm/system.h> - -+#include <ub/ub_net.h> -+ - static kmem_cache_t *skbuff_head_cache __read_mostly; - static kmem_cache_t *skbuff_fclone_cache __read_mostly; - -@@ -147,6 +150,9 @@ struct sk_buff *__alloc_skb(unsigned int - if (!skb) - goto out; - -+ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) -+ goto nobc; -+ - /* Get the DATA. Size must match skb_add_mtu(). 
*/ - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); -@@ -160,6 +166,7 @@ struct sk_buff *__alloc_skb(unsigned int - skb->data = data; - skb->tail = data; - skb->end = data + size; -+ SET_VE_OWNER_SKB(skb, get_exec_env()); - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - atomic_set(&shinfo->dataref, 1); -@@ -182,6 +189,8 @@ struct sk_buff *__alloc_skb(unsigned int - out: - return skb; - nodata: -+ ub_skb_free_bc(skb); -+nobc: - kmem_cache_free(cache, skb); - skb = NULL; - goto out; -@@ -214,6 +223,9 @@ struct sk_buff *alloc_skb_from_cache(kme - if (!skb) - goto out; - -+ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) -+ goto nobc; -+ - /* Get the DATA. */ - size = SKB_DATA_ALIGN(size); - data = kmem_cache_alloc(cp, gfp_mask); -@@ -227,6 +239,7 @@ struct sk_buff *alloc_skb_from_cache(kme - skb->data = data; - skb->tail = data; - skb->end = data + size; -+ SET_VE_OWNER_SKB(skb, get_exec_env()); - - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; -@@ -236,6 +249,8 @@ struct sk_buff *alloc_skb_from_cache(kme - out: - return skb; - nodata: -+ ub_skb_free_bc(skb); -+nobc: - kmem_cache_free(skbuff_head_cache, skb); - skb = NULL; - goto out; -@@ -290,6 +305,7 @@ void kfree_skbmem(struct sk_buff *skb) - atomic_t *fclone_ref; - - skb_release_data(skb); -+ ub_skb_free_bc(skb); - switch (skb->fclone) { - case SKB_FCLONE_UNAVAILABLE: - kmem_cache_free(skbuff_head_cache, skb); -@@ -331,6 +347,7 @@ void __kfree_skb(struct sk_buff *skb) - #ifdef CONFIG_XFRM - secpath_put(skb->sp); - #endif -+ ub_skb_uncharge(skb); - if (skb->destructor) { - WARN_ON(in_irq()); - skb->destructor(skb); -@@ -386,6 +403,11 @@ struct sk_buff *skb_clone(struct sk_buff - n->fclone = SKB_FCLONE_UNAVAILABLE; - } - -+ if (ub_skb_alloc_bc(n, gfp_mask)) { -+ kmem_cache_free(skbuff_head_cache, n); -+ return NULL; -+ } -+ - #define C(x) n->x = skb->x - - n->next = n->prev = NULL; -@@ -415,6 +437,7 @@ 
struct sk_buff *skb_clone(struct sk_buff - C(ipvs_property); - #endif - C(protocol); -+ SET_VE_OWNER_SKB(n, VE_OWNER_SKB(skb)); - n->destructor = NULL; - #ifdef CONFIG_NETFILTER - C(nfmark); -diff -upr linux-2.6.16.orig/net/core/sock.c linux-2.6.16-026test009/net/core/sock.c ---- linux-2.6.16.orig/net/core/sock.c 2006-04-19 15:02:01.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/sock.c 2006-04-19 15:02:12.000000000 +0400 -@@ -108,6 +108,7 @@ - #include <linux/net.h> - #include <linux/mm.h> - #include <linux/slab.h> -+#include <linux/kmem_cache.h> - #include <linux/interrupt.h> - #include <linux/poll.h> - #include <linux/tcp.h> -@@ -124,6 +125,9 @@ - #include <net/xfrm.h> - #include <linux/ipsec.h> - -+#include <ub/ub_net.h> -+#include <ub/beancounter.h> -+ - #include <linux/filter.h> - - #ifdef CONFIG_INET -@@ -172,7 +176,7 @@ static void sock_warn_obsolete_bsdism(co - static char warncomm[TASK_COMM_LEN]; - if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); -- printk(KERN_WARNING "process `%s' is using obsolete " -+ ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete " - "%s SO_BSDCOMPAT\n", warncomm, name); - warned++; - } -@@ -404,8 +408,9 @@ set_rcvbuf: - if (!valbool) { - sk->sk_bound_dev_if = 0; - } else { -- if (optlen > IFNAMSIZ) -- optlen = IFNAMSIZ; -+ if (optlen > IFNAMSIZ - 1) -+ optlen = IFNAMSIZ - 1; -+ memset(devname, 0, sizeof(devname)); - if (copy_from_user(devname, optval, optlen)) { - ret = -EFAULT; - break; -@@ -659,6 +664,7 @@ struct sock *sk_alloc(int family, gfp_t - */ - sk->sk_prot = sk->sk_prot_creator = prot; - sock_lock_init(sk); -+ SET_VE_OWNER_SK(sk, get_exec_env()); - } - - if (security_sk_alloc(sk, family, priority)) -@@ -698,6 +704,7 @@ void sk_free(struct sock *sk) - __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); - - security_sk_free(sk); -+ ub_sock_uncharge(sk); - if (sk->sk_prot_creator->slab != NULL) - kmem_cache_free(sk->sk_prot_creator->slab, sk); - else -@@ -714,6 +721,11 
@@ struct sock *sk_clone(const struct sock - - memcpy(newsk, sk, sk->sk_prot->obj_size); - -+ if (ub_sock_charge(newsk, sk->sk_family, sk->sk_type) < 0) { -+ sk_free(newsk); -+ return NULL; -+ } -+ - /* SANITY */ - sk_node_init(&newsk->sk_node); - sock_lock_init(newsk); -@@ -934,14 +946,12 @@ static long sock_wait_for_wmem(struct so - /* - * Generic send/receive buffer handlers - */ -- --static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, -- unsigned long header_len, -- unsigned long data_len, -- int noblock, int *errcode) -+struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size, -+ unsigned long size2, int noblock, -+ int *errcode) - { - struct sk_buff *skb; -- gfp_t gfp_mask; -+ unsigned int gfp_mask; - long timeo; - int err; - -@@ -959,46 +969,35 @@ static struct sk_buff *sock_alloc_send_p - if (sk->sk_shutdown & SEND_SHUTDOWN) - goto failure; - -- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { -- skb = alloc_skb(header_len, sk->sk_allocation); -- if (skb) { -- int npages; -- int i; -- -- /* No pages, we're done... */ -- if (!data_len) -- break; -- -- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; -- skb->truesize += data_len; -- skb_shinfo(skb)->nr_frags = npages; -- for (i = 0; i < npages; i++) { -- struct page *page; -- skb_frag_t *frag; -- -- page = alloc_pages(sk->sk_allocation, 0); -- if (!page) { -- err = -ENOBUFS; -- skb_shinfo(skb)->nr_frags = i; -- kfree_skb(skb); -- goto failure; -- } -- -- frag = &skb_shinfo(skb)->frags[i]; -- frag->page = page; -- frag->page_offset = 0; -- frag->size = (data_len >= PAGE_SIZE ? 
-- PAGE_SIZE : -- data_len); -- data_len -= PAGE_SIZE; -- } -+ if (ub_sock_getwres_other(sk, skb_charge_size(size))) { -+ if (size2 < size) { -+ size = size2; -+ continue; -+ } -+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); -+ err = -EAGAIN; -+ if (!timeo) -+ goto failure; -+ if (signal_pending(current)) -+ goto interrupted; -+ timeo = ub_sock_wait_for_space(sk, timeo, -+ skb_charge_size(size)); -+ continue; -+ } - -+ if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { -+ skb = alloc_skb(size, sk->sk_allocation); -+ if (skb) - /* Full success... */ - break; -- } -+ ub_sock_retwres_other(sk, skb_charge_size(size), -+ SOCK_MIN_UBCSPACE_CH); - err = -ENOBUFS; - goto failure; - } -+ ub_sock_retwres_other(sk, -+ skb_charge_size(size), -+ SOCK_MIN_UBCSPACE_CH); - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; -@@ -1009,6 +1008,7 @@ static struct sk_buff *sock_alloc_send_p - timeo = sock_wait_for_wmem(sk, timeo); - } - -+ ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF); - skb_set_owner_w(skb, sk); - return skb; - -@@ -1022,7 +1022,7 @@ failure: - struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, - int noblock, int *errcode) - { -- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); -+ return sock_alloc_send_skb2(sk, size, size, noblock, errcode); - } - - static void __lock_sock(struct sock *sk) -@@ -1462,7 +1462,8 @@ int proto_register(struct proto *prot, i - - if (alloc_slab) { - prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, -- SLAB_HWCACHE_ALIGN, NULL, NULL); -+ SLAB_HWCACHE_ALIGN | SLAB_UBC, -+ NULL, NULL); - - if (prot->slab == NULL) { - printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", -@@ -1478,9 +1479,11 @@ int proto_register(struct proto *prot, i - goto out_free_sock_slab; - - sprintf(request_sock_slab_name, mask, prot->name); -- prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, -- 
prot->rsk_prot->obj_size, 0, -- SLAB_HWCACHE_ALIGN, NULL, NULL); -+ prot->rsk_prot->slab = -+ kmem_cache_create(request_sock_slab_name, -+ prot->rsk_prot->obj_size, 0, -+ SLAB_HWCACHE_ALIGN | SLAB_UBC, -+ NULL, NULL); - - if (prot->rsk_prot->slab == NULL) { - printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", -@@ -1501,7 +1504,7 @@ int proto_register(struct proto *prot, i - prot->twsk_prot->twsk_slab = - kmem_cache_create(timewait_sock_slab_name, - prot->twsk_prot->twsk_obj_size, -- 0, SLAB_HWCACHE_ALIGN, -+ 0, SLAB_HWCACHE_ALIGN | SLAB_UBC, - NULL, NULL); - if (prot->twsk_prot->twsk_slab == NULL) - goto out_free_timewait_sock_slab_name; -diff -upr linux-2.6.16.orig/net/core/stream.c linux-2.6.16-026test009/net/core/stream.c ---- linux-2.6.16.orig/net/core/stream.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/core/stream.c 2006-04-19 15:02:12.000000000 +0400 -@@ -111,8 +111,9 @@ EXPORT_SYMBOL(sk_stream_wait_close); - * sk_stream_wait_memory - Wait for more memory for a socket - * @sk: socket to wait for memory - * @timeo_p: for how long -+ * @amount - amount of memory to wait for (in UB space!) 
- */ --int sk_stream_wait_memory(struct sock *sk, long *timeo_p) -+int sk_stream_wait_memory(struct sock *sk, long *timeo_p, unsigned long amount) - { - int err = 0; - long vm_wait = 0; -@@ -134,8 +135,11 @@ int sk_stream_wait_memory(struct sock *s - if (signal_pending(current)) - goto do_interrupted; - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); -- if (sk_stream_memory_free(sk) && !vm_wait) -- break; -+ if (amount == 0) { -+ if (sk_stream_memory_free(sk) && !vm_wait) -+ break; -+ } else -+ ub_sock_sndqueueadd_tcp(sk, amount); - - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - sk->sk_write_pending++; -@@ -144,6 +148,8 @@ int sk_stream_wait_memory(struct sock *s - sk_stream_memory_free(sk) && - vm_wait); - sk->sk_write_pending--; -+ if (amount > 0) -+ ub_sock_sndqueuedel(sk); - - if (vm_wait) { - vm_wait -= current_timeo; -diff -upr linux-2.6.16.orig/net/ipv4/af_inet.c linux-2.6.16-026test009/net/ipv4/af_inet.c ---- linux-2.6.16.orig/net/ipv4/af_inet.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/af_inet.c 2006-04-19 15:02:12.000000000 +0400 -@@ -114,6 +114,7 @@ - #ifdef CONFIG_IP_MROUTE - #include <linux/mroute.h> - #endif -+#include <ub/ub_net.h> - - DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; - -@@ -298,6 +299,13 @@ lookup_protocol: - if (sk == NULL) - goto out; - -+ err = -ENOBUFS; -+ if (ub_sock_charge(sk, PF_INET, sock->type)) -+ goto out_sk_free; -+ /* if charge was successful, sock_init_data() MUST be called to -+ * set sk->sk_type. 
otherwise sk will be uncharged to wrong resource -+ */ -+ - err = 0; - sk->sk_no_check = answer_no_check; - if (INET_PROTOSW_REUSE & answer_flags) -@@ -355,6 +363,9 @@ out: - out_rcu_unlock: - rcu_read_unlock(); - goto out; -+out_sk_free: -+ sk_free(sk); -+ return err; - } - - -@@ -369,6 +380,9 @@ int inet_release(struct socket *sock) - - if (sk) { - long timeout; -+ struct ve_struct *saved_env; -+ -+ saved_env = set_exec_env(VE_OWNER_SK(sk)); - - /* Applications forget to leave groups before exiting */ - ip_mc_drop_socket(sk); -@@ -386,6 +400,8 @@ int inet_release(struct socket *sock) - timeout = sk->sk_lingertime; - sock->sk = NULL; - sk->sk_prot->close(sk, timeout); -+ -+ (void)set_exec_env(saved_env); - } - return 0; - } -@@ -1108,20 +1124,20 @@ static struct net_protocol icmp_protocol - - static int __init init_ipv4_mibs(void) - { -- net_statistics[0] = alloc_percpu(struct linux_mib); -- net_statistics[1] = alloc_percpu(struct linux_mib); -- ip_statistics[0] = alloc_percpu(struct ipstats_mib); -- ip_statistics[1] = alloc_percpu(struct ipstats_mib); -- icmp_statistics[0] = alloc_percpu(struct icmp_mib); -- icmp_statistics[1] = alloc_percpu(struct icmp_mib); -- tcp_statistics[0] = alloc_percpu(struct tcp_mib); -- tcp_statistics[1] = alloc_percpu(struct tcp_mib); -- udp_statistics[0] = alloc_percpu(struct udp_mib); -- udp_statistics[1] = alloc_percpu(struct udp_mib); -+ ve_net_statistics[0] = alloc_percpu(struct linux_mib); -+ ve_net_statistics[1] = alloc_percpu(struct linux_mib); -+ ve_ip_statistics[0] = alloc_percpu(struct ipstats_mib); -+ ve_ip_statistics[1] = alloc_percpu(struct ipstats_mib); -+ ve_icmp_statistics[0] = alloc_percpu(struct icmp_mib); -+ ve_icmp_statistics[1] = alloc_percpu(struct icmp_mib); -+ ve_tcp_statistics[0] = alloc_percpu(struct tcp_mib); -+ ve_tcp_statistics[1] = alloc_percpu(struct tcp_mib); -+ ve_udp_statistics[0] = alloc_percpu(struct udp_mib); -+ ve_udp_statistics[1] = alloc_percpu(struct udp_mib); - if (! 
-- (net_statistics[0] && net_statistics[1] && ip_statistics[0] -- && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1] -- && udp_statistics[0] && udp_statistics[1])) -+ (ve_net_statistics[0] && ve_net_statistics[1] && ve_ip_statistics[0] -+ && ve_ip_statistics[1] && ve_tcp_statistics[0] && ve_tcp_statistics[1] -+ && ve_udp_statistics[0] && ve_udp_statistics[1])) - return -ENOMEM; - - (void) tcp_mib_init(); -diff -upr linux-2.6.16.orig/net/ipv4/arp.c linux-2.6.16-026test009/net/ipv4/arp.c ---- linux-2.6.16.orig/net/ipv4/arp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/arp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -988,7 +988,7 @@ static int arp_req_set(struct arpreq *r, - return 0; - } - if (dev == NULL) { -- ipv4_devconf.proxy_arp = 1; -+ ve_ipv4_devconf.proxy_arp = 1; - return 0; - } - if (__in_dev_get_rtnl(dev)) { -@@ -1094,7 +1094,7 @@ static int arp_req_delete(struct arpreq - return pneigh_delete(&arp_tbl, &ip, dev); - if (mask == 0) { - if (dev == NULL) { -- ipv4_devconf.proxy_arp = 0; -+ ve_ipv4_devconf.proxy_arp = 0; - return 0; - } - if (__in_dev_get_rtnl(dev)) { -@@ -1145,6 +1145,8 @@ int arp_ioctl(unsigned int cmd, void __u - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - case SIOCGARP: -+ if (!ve_is_super(get_exec_env())) -+ return -EACCES; - err = copy_from_user(&r, arg, sizeof(struct arpreq)); - if (err) - return -EFAULT; -@@ -1372,8 +1374,12 @@ static int arp_seq_open(struct inode *in - { - struct seq_file *seq; - int rc = -ENOMEM; -- struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL); -- -+ struct neigh_seq_state *s; -+ -+ if (!ve_is_super(get_exec_env())) -+ return -EPERM; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - goto out; - -@@ -1401,7 +1407,7 @@ static struct file_operations arp_seq_fo - - static int __init arp_proc_init(void) - { -- if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops)) -+ if (!proc_glob_fops_create("net/arp", S_IRUGO, &arp_seq_fops)) - return -ENOMEM; - return 0; 
- } -diff -upr linux-2.6.16.orig/net/ipv4/devinet.c linux-2.6.16-026test009/net/ipv4/devinet.c ---- linux-2.6.16.orig/net/ipv4/devinet.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/devinet.c 2006-04-19 15:02:13.000000000 +0400 -@@ -71,7 +71,7 @@ struct ipv4_devconf ipv4_devconf = { - .shared_media = 1, - }; - --static struct ipv4_devconf ipv4_devconf_dflt = { -+struct ipv4_devconf ipv4_devconf_dflt = { - .accept_redirects = 1, - .send_redirects = 1, - .secure_redirects = 1, -@@ -79,10 +79,16 @@ static struct ipv4_devconf ipv4_devconf_ - .accept_source_route = 1, - }; - -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ve_ipv4_devconf_dflt (*(get_exec_env()->_ipv4_devconf_dflt)) -+#else -+#define ve_ipv4_devconf_dflt ipv4_devconf_dflt -+#endif -+ - static void rtmsg_ifa(int event, struct in_ifaddr *); - - static struct notifier_block *inetaddr_chain; --static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, -+void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, - int destroy); - #ifdef CONFIG_SYSCTL - static void devinet_sysctl_register(struct in_device *in_dev, -@@ -92,7 +98,7 @@ static void devinet_sysctl_unregister(st - - /* Locks all the inet devices. 
*/ - --static struct in_ifaddr *inet_alloc_ifa(void) -+struct in_ifaddr *inet_alloc_ifa(void) - { - struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); - -@@ -103,6 +109,7 @@ static struct in_ifaddr *inet_alloc_ifa( - - return ifa; - } -+EXPORT_SYMBOL_GPL(inet_alloc_ifa); - - static void inet_rcu_free_ifa(struct rcu_head *head) - { -@@ -175,6 +182,7 @@ out_kfree: - in_dev = NULL; - goto out; - } -+EXPORT_SYMBOL_GPL(inetdev_init); - - static void in_dev_rcu_put(struct rcu_head *head) - { -@@ -190,7 +198,7 @@ static void inetdev_destroy(struct in_de - ASSERT_RTNL(); - - dev = in_dev->dev; -- if (dev == &loopback_dev) -+ if (dev == &ve0_loopback) - return; - - in_dev->dead = 1; -@@ -232,7 +240,7 @@ int inet_addr_onlink(struct in_device *i - return 0; - } - --static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, -+void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, - int destroy) - { - struct in_ifaddr *promote = NULL; -@@ -320,7 +328,7 @@ static void inet_del_ifa(struct in_devic - } - } - --static int inet_insert_ifa(struct in_ifaddr *ifa) -+int inet_insert_ifa(struct in_ifaddr *ifa) - { - struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap, **last_primary; -@@ -370,6 +378,7 @@ static int inet_insert_ifa(struct in_ifa - - return 0; - } -+EXPORT_SYMBOL_GPL(inet_insert_ifa); - - static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) - { -@@ -578,7 +587,7 @@ int devinet_ioctl(unsigned int cmd, void - - case SIOCSIFFLAGS: - ret = -EACCES; -- if (!capable(CAP_NET_ADMIN)) -+ if (!capable(CAP_VE_NET_ADMIN)) - goto out; - break; - case SIOCSIFADDR: /* Set interface address (and family) */ -@@ -586,7 +595,7 @@ int devinet_ioctl(unsigned int cmd, void - case SIOCSIFDSTADDR: /* Set the destination address */ - case SIOCSIFNETMASK: /* Set the netmask for the interface */ - ret = -EACCES; -- if (!capable(CAP_NET_ADMIN)) -+ if (!capable(CAP_VE_NET_ADMIN)) - goto out; - ret = -EINVAL; - if 
(sin->sin_family != AF_INET) -@@ -1163,10 +1172,10 @@ static struct rtnetlink_link inet_rtnetl - void inet_forward_change(void) - { - struct net_device *dev; -- int on = ipv4_devconf.forwarding; -+ int on = ve_ipv4_devconf.forwarding; - -- ipv4_devconf.accept_redirects = !on; -- ipv4_devconf_dflt.forwarding = on; -+ ve_ipv4_devconf.accept_redirects = !on; -+ ve_ipv4_devconf_dflt.forwarding = on; - - read_lock(&dev_base_lock); - for (dev = dev_base; dev; dev = dev->next) { -@@ -1191,9 +1200,9 @@ static int devinet_sysctl_forward(ctl_ta - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - - if (write && *valp != val) { -- if (valp == &ipv4_devconf.forwarding) -+ if (valp == &ve_ipv4_devconf.forwarding) - inet_forward_change(); -- else if (valp != &ipv4_devconf_dflt.forwarding) -+ else if (valp != &ve_ipv4_devconf_dflt.forwarding) - rt_cache_flush(0); - } - -@@ -1464,30 +1473,22 @@ static struct devinet_sysctl_table { - }, - }; - --static void devinet_sysctl_register(struct in_device *in_dev, -- struct ipv4_devconf *p) -+static struct devinet_sysctl_table *__devinet_sysctl_register(char *dev_name, -+ int ifindex, struct ipv4_devconf *p) - { - int i; -- struct net_device *dev = in_dev ? 
in_dev->dev : NULL; -- struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); -- char *dev_name = NULL; -+ struct devinet_sysctl_table *t; - -+ t = kmalloc(sizeof(*t), GFP_KERNEL); - if (!t) -- return; -+ goto out; -+ - memcpy(t, &devinet_sysctl, sizeof(*t)); - for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { - t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; - t->devinet_vars[i].de = NULL; - } - -- if (dev) { -- dev_name = dev->name; -- t->devinet_dev[0].ctl_name = dev->ifindex; -- } else { -- dev_name = "default"; -- t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; -- } -- - /* - * Make a copy of dev_name, because '.procname' is regarded as const - * by sysctl and we wouldn't want anyone to change it under our feet -@@ -1495,8 +1496,9 @@ static void devinet_sysctl_register(stru - */ - dev_name = kstrdup(dev_name, GFP_KERNEL); - if (!dev_name) -- goto free; -+ goto out_free_table; - -+ t->devinet_dev[0].ctl_name = ifindex; - t->devinet_dev[0].procname = dev_name; - t->devinet_dev[0].child = t->devinet_vars; - t->devinet_dev[0].de = NULL; -@@ -1509,17 +1511,38 @@ static void devinet_sysctl_register(stru - - t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); - if (!t->sysctl_header) -- goto free_procname; -+ goto out_free_procname; - -- p->sysctl = t; -- return; -+ return t; - - /* error path */ -- free_procname: -+out_free_procname: - kfree(dev_name); -- free: -+out_free_table: - kfree(t); -- return; -+out: -+ printk(KERN_DEBUG "Can't register net/ipv4/conf sysctls.\n"); -+ return NULL; -+} -+ -+static void devinet_sysctl_register(struct in_device *in_dev, -+ struct ipv4_devconf *p) -+{ -+ struct net_device *dev; -+ char *dev_name; -+ int ifindex; -+ -+ dev = in_dev ? 
in_dev->dev : NULL; -+ -+ if (dev) { -+ dev_name = dev->name; -+ ifindex = dev->ifindex; -+ } else { -+ dev_name = "default"; -+ ifindex = NET_PROTO_CONF_DEFAULT; -+ } -+ -+ p->sysctl = __devinet_sysctl_register(dev_name, ifindex, p); - } - - static void devinet_sysctl_unregister(struct ipv4_devconf *p) -@@ -1532,7 +1555,170 @@ static void devinet_sysctl_unregister(st - kfree(t); - } - } -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+static ctl_table net_sysctl_tables[] = { -+ /* 0: net */ -+ { -+ .ctl_name = CTL_NET, -+ .procname = "net", -+ .mode = 0555, -+ .child = &net_sysctl_tables[2], -+ }, -+ { .ctl_name = 0, }, -+ /* 2: net/ipv4 */ -+ { -+ .ctl_name = NET_IPV4, -+ .procname = "ipv4", -+ .mode = 0555, -+ .child = &net_sysctl_tables[4], -+ }, -+ { .ctl_name = 0, }, -+ /* 4, 5: net/ipv4/[vars] */ -+ { -+ .ctl_name = NET_IPV4_FORWARD, -+ .procname = "ip_forward", -+ .data = &ipv4_devconf.forwarding, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &ipv4_sysctl_forward, -+ .strategy = &ipv4_sysctl_forward_strategy, -+ }, -+ { -+ .ctl_name = NET_IPV4_ROUTE, -+ .procname = "route", -+ .maxlen = 0, -+ .mode = 0555, -+ .child = &net_sysctl_tables[7], -+ }, -+ { .ctl_name = 0 }, -+ /* 7: net/ipv4/route/flush */ -+ { -+ .ctl_name = NET_IPV4_ROUTE_FLUSH, -+ .procname = "flush", -+ .data = NULL, /* setuped below */ -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &ipv4_sysctl_rtcache_flush, -+ .strategy = &ipv4_sysctl_rtcache_flush_strategy, -+ }, -+ { .ctl_name = 0 }, -+}; -+ -+static int ip_forward_sysctl_register(struct ve_struct *ve, -+ struct ipv4_devconf *p) -+{ -+ struct ctl_table_header *hdr; -+ ctl_table *root; -+ -+ root = clone_sysctl_template(net_sysctl_tables, -+ sizeof(net_sysctl_tables) / sizeof(ctl_table)); -+ if (root == NULL) -+ goto out; -+ -+ root[4].data = &p->forwarding; -+ root[7].data = &ipv4_flush_delay; -+ -+ hdr = register_sysctl_table(root, 1); -+ if (hdr == NULL) -+ goto out_free; -+ -+ 
ve->forward_header = hdr; -+ ve->forward_table = root; -+ return 0; -+ -+out_free: -+ free_sysctl_clone(root); -+out: -+ return -ENOMEM; -+} -+ -+static inline void ip_forward_sysctl_unregister(struct ve_struct *ve) -+{ -+ unregister_sysctl_table(ve->forward_header); -+ ve->forward_header = NULL; -+} -+ -+static inline void ip_forward_sysctl_free(struct ve_struct *ve) -+{ -+ free_sysctl_clone(ve->forward_table); -+ ve->forward_table = NULL; -+} -+#endif -+#endif -+ -+int devinet_sysctl_init(struct ve_struct *ve) -+{ -+ int err = 0; -+#ifdef CONFIG_SYSCTL -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+ struct ipv4_devconf *conf, *conf_def; -+ -+ err = -ENOMEM; -+ -+ conf = kmalloc(sizeof(*conf), GFP_KERNEL); -+ if (!conf) -+ goto err1; -+ -+ memcpy(conf, &ipv4_devconf, sizeof(*conf)); -+ conf->sysctl = __devinet_sysctl_register("all", -+ NET_PROTO_CONF_ALL, conf); -+ if (!conf->sysctl) -+ goto err2; -+ -+ conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL); -+ if (!conf_def) -+ goto err3; -+ -+ memcpy(conf_def, &ipv4_devconf_dflt, sizeof(*conf_def)); -+ conf_def->sysctl = __devinet_sysctl_register("default", -+ NET_PROTO_CONF_DEFAULT, conf_def); -+ if (!conf_def->sysctl) -+ goto err4; -+ -+ err = ip_forward_sysctl_register(ve, conf); -+ if (err) -+ goto err5; -+ -+ ve->_ipv4_devconf = conf; -+ ve->_ipv4_devconf_dflt = conf_def; -+ return 0; -+ -+err5: -+ devinet_sysctl_unregister(conf_def); -+err4: -+ kfree(conf_def); -+err3: -+ devinet_sysctl_unregister(conf); -+err2: -+ kfree(conf); -+err1: - #endif -+#endif -+ return err; -+} -+ -+void devinet_sysctl_fini(struct ve_struct *ve) -+{ -+#ifdef CONFIG_SYSCTL -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+ ip_forward_sysctl_unregister(ve); -+ devinet_sysctl_unregister(ve->_ipv4_devconf); -+ devinet_sysctl_unregister(ve->_ipv4_devconf_dflt); -+#endif -+#endif -+} -+ -+void devinet_sysctl_free(struct ve_struct *ve) -+{ -+#ifdef CONFIG_SYSCTL -+#if defined(CONFIG_VE_NETDEV) || 
defined(CONFIG_VE_NETDEV_MODULE) -+ ip_forward_sysctl_free(ve); -+ kfree(ve->_ipv4_devconf); -+ kfree(ve->_ipv4_devconf_dflt); -+#endif -+#endif -+} - - void __init devinet_init(void) - { -@@ -1542,13 +1728,18 @@ void __init devinet_init(void) - #ifdef CONFIG_SYSCTL - devinet_sysctl.sysctl_header = - register_sysctl_table(devinet_sysctl.devinet_root_dir, 0); -- devinet_sysctl_register(NULL, &ipv4_devconf_dflt); -+ __devinet_sysctl_register("default", NET_PROTO_CONF_DEFAULT, -+ &ipv4_devconf_dflt); - #endif - } - - EXPORT_SYMBOL(devinet_ioctl); - EXPORT_SYMBOL(in_dev_finish_destroy); - EXPORT_SYMBOL(inet_select_addr); -+EXPORT_SYMBOL(inet_del_ifa); - EXPORT_SYMBOL(inetdev_by_index); -+EXPORT_SYMBOL(devinet_sysctl_init); -+EXPORT_SYMBOL(devinet_sysctl_fini); -+EXPORT_SYMBOL(devinet_sysctl_free); - EXPORT_SYMBOL(register_inetaddr_notifier); - EXPORT_SYMBOL(unregister_inetaddr_notifier); -diff -upr linux-2.6.16.orig/net/ipv4/fib_frontend.c linux-2.6.16-026test009/net/ipv4/fib_frontend.c ---- linux-2.6.16.orig/net/ipv4/fib_frontend.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/fib_frontend.c 2006-04-19 15:02:12.000000000 +0400 -@@ -53,14 +53,46 @@ - - #define RT_TABLE_MIN RT_TABLE_MAIN - -+#undef ip_fib_local_table -+#undef ip_fib_main_table - struct fib_table *ip_fib_local_table; - struct fib_table *ip_fib_main_table; -+void prepare_fib_tables(void) -+{ -+#ifdef CONFIG_VE -+ get_ve0()->_local_table = ip_fib_local_table; -+ ip_fib_local_table = (struct fib_table *)0x12345678; -+ get_ve0()->_main_table = ip_fib_main_table; -+ ip_fib_main_table = (struct fib_table *)0x12345678; -+#endif -+} -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define ip_fib_local_table get_exec_env()->_local_table -+#define ip_fib_main_table get_exec_env()->_main_table -+#endif - - #else - - #define RT_TABLE_MIN 1 - -+#undef fib_tables - struct fib_table *fib_tables[RT_TABLE_MAX+1]; -+void prepare_fib_tables(void) -+{ -+#ifdef CONFIG_VE -+ 
int i; -+ -+ BUG_ON(sizeof(fib_tables) != -+ sizeof(((struct ve_struct *)0)->_fib_tables)); -+ memcpy(get_ve0()->_fib_tables, fib_tables, sizeof(fib_tables)); -+ for (i = 0; i <= RT_TABLE_MAX; i++) -+ fib_tables[i] = (void *)0x12366678; -+#endif -+} -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define fib_tables get_exec_env()->_fib_tables -+#endif - - struct fib_table *__fib_new_table(int id) - { -@@ -250,7 +282,7 @@ int ip_rt_ioctl(unsigned int cmd, void _ - switch (cmd) { - case SIOCADDRT: /* Add a route */ - case SIOCDELRT: /* Delete a route */ -- if (!capable(CAP_NET_ADMIN)) -+ if (!capable(CAP_VE_NET_ADMIN)) - return -EPERM; - if (copy_from_user(&r, arg, sizeof(struct rtentry))) - return -EFAULT; -@@ -653,6 +685,7 @@ static struct notifier_block fib_netdev_ - - void __init ip_fib_init(void) - { -+ prepare_fib_tables(); - #ifndef CONFIG_IP_MULTIPLE_TABLES - ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); - ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); -diff -upr linux-2.6.16.orig/net/ipv4/fib_hash.c linux-2.6.16-026test009/net/ipv4/fib_hash.c ---- linux-2.6.16.orig/net/ipv4/fib_hash.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/fib_hash.c 2006-04-19 15:02:12.000000000 +0400 -@@ -36,6 +36,7 @@ - #include <linux/skbuff.h> - #include <linux/netlink.h> - #include <linux/init.h> -+#include <linux/ve.h> - - #include <net/ip.h> - #include <net/protocol.h> -@@ -73,11 +74,6 @@ struct fn_zone { - * can be cheaper than memory lookup, so that FZ_* macros are used. 
- */ - --struct fn_hash { -- struct fn_zone *fn_zones[33]; -- struct fn_zone *fn_zone_list; --}; -- - static inline u32 fn_hash(u32 key, struct fn_zone *fz) - { - u32 h = ntohl(key)>>(32 - fz->fz_order); -@@ -623,7 +619,7 @@ fn_hash_delete(struct fib_table *tb, str - return -ESRCH; - } - --static int fn_flush_list(struct fn_zone *fz, int idx) -+static int fn_flush_list(struct fn_zone *fz, int idx, int destroy) - { - struct hlist_head *head = &fz->fz_hash[idx]; - struct hlist_node *node, *n; -@@ -638,7 +634,9 @@ static int fn_flush_list(struct fn_zone - list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - -- if (fi && (fi->fib_flags&RTNH_F_DEAD)) { -+ if (fi == NULL) -+ continue; -+ if (destroy || (fi->fib_flags&RTNH_F_DEAD)) { - write_lock_bh(&fib_hash_lock); - list_del(&fa->fa_list); - if (list_empty(&f->fn_alias)) { -@@ -660,7 +658,7 @@ static int fn_flush_list(struct fn_zone - return found; - } - --static int fn_hash_flush(struct fib_table *tb) -+static int __fn_hash_flush(struct fib_table *tb, int destroy) - { - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz; -@@ -670,11 +668,84 @@ static int fn_hash_flush(struct fib_tabl - int i; - - for (i = fz->fz_divisor - 1; i >= 0; i--) -- found += fn_flush_list(fz, i); -+ found += fn_flush_list(fz, i, destroy); - } - return found; - } - -+static int fn_hash_flush(struct fib_table *tb) -+{ -+ return __fn_hash_flush(tb, 0); -+} -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+void fib_hash_destroy(struct fib_table *tb) -+{ -+ __fn_hash_flush(tb, 1); -+ kfree(tb); -+} -+ -+/* -+ * Initialization of virtualized networking subsystem. 
-+ */ -+int init_ve_route(struct ve_struct *ve) -+{ -+#ifdef CONFIG_IP_MULTIPLE_TABLES -+ if (fib_rules_create()) -+ return -ENOMEM; -+ ve->_fib_tables[RT_TABLE_LOCAL] = fib_hash_init(RT_TABLE_LOCAL); -+ if (!ve->_fib_tables[RT_TABLE_LOCAL]) -+ goto out_destroy; -+ ve->_fib_tables[RT_TABLE_MAIN] = fib_hash_init(RT_TABLE_MAIN); -+ if (!ve->_fib_tables[RT_TABLE_MAIN]) -+ goto out_destroy_local; -+ -+ return 0; -+ -+out_destroy_local: -+ fib_hash_destroy(ve->_fib_tables[RT_TABLE_LOCAL]); -+out_destroy: -+ fib_rules_destroy(); -+ ve->_local_rule = NULL; -+ return -ENOMEM; -+#else -+ ve->_local_table = fib_hash_init(RT_TABLE_LOCAL); -+ if (!ve->_local_table) -+ return -ENOMEM; -+ ve->_main_table = fib_hash_init(RT_TABLE_MAIN); -+ if (!ve->_main_table) { -+ fib_hash_destroy(ve->_local_table); -+ return -ENOMEM; -+ } -+ return 0; -+#endif -+} -+ -+void fini_ve_route(struct ve_struct *ve) -+{ -+#ifdef CONFIG_IP_MULTIPLE_TABLES -+ int i; -+ for (i=0; i<RT_TABLE_MAX+1; i++) -+ { -+ if (!ve->_fib_tables[i]) -+ continue; -+ fib_hash_destroy(ve->_fib_tables[i]); -+ } -+ fib_rules_destroy(); -+ ve->_local_rule = NULL; -+#else -+ fib_hash_destroy(ve->_local_table); -+ fib_hash_destroy(ve->_main_table); -+#endif -+ fib_hash_free(ve->_fib_info_hash, ve->_fib_hash_size); -+ fib_hash_free(ve->_fib_info_laddrhash, ve->_fib_hash_size); -+ ve->_fib_info_hash = ve->_fib_info_laddrhash = NULL; -+} -+ -+EXPORT_SYMBOL(init_ve_route); -+EXPORT_SYMBOL(fini_ve_route); -+#endif -+ - - static inline int - fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, -@@ -766,7 +837,7 @@ static int fn_hash_dump(struct fib_table - return skb->len; - } - --#ifdef CONFIG_IP_MULTIPLE_TABLES -+#if defined(CONFIG_IP_MULTIPLE_TABLES) || defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) - struct fib_table * fib_hash_init(int id) - #else - struct fib_table * __init fib_hash_init(int id) -@@ -1076,13 +1147,13 @@ static struct file_operations fib_seq_fo - - int __init fib_proc_init(void) 
- { -- if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops)) -+ if (!proc_glob_fops_create("net/route", S_IRUGO, &fib_seq_fops)) - return -ENOMEM; - return 0; - } - - void __init fib_proc_exit(void) - { -- proc_net_remove("route"); -+ remove_proc_glob_entry("net/route", NULL); - } - #endif /* CONFIG_PROC_FS */ -diff -upr linux-2.6.16.orig/net/ipv4/fib_lookup.h linux-2.6.16-026test009/net/ipv4/fib_lookup.h ---- linux-2.6.16.orig/net/ipv4/fib_lookup.h 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/fib_lookup.h 2006-04-19 15:02:12.000000000 +0400 -@@ -41,5 +41,6 @@ extern struct fib_alias *fib_find_alias( - extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, - int *last_idx, int *dflt); -+void fib_hash_free(struct hlist_head *hash, int bytes); - - #endif /* _FIB_LOOKUP_H */ -diff -upr linux-2.6.16.orig/net/ipv4/fib_rules.c linux-2.6.16-026test009/net/ipv4/fib_rules.c ---- linux-2.6.16.orig/net/ipv4/fib_rules.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/fib_rules.c 2006-04-19 15:02:12.000000000 +0400 -@@ -39,6 +39,7 @@ - #include <linux/proc_fs.h> - #include <linux/skbuff.h> - #include <linux/netlink.h> -+#include <linux/rtnetlink.h> - #include <linux/init.h> - - #include <net/ip.h> -@@ -99,9 +100,87 @@ static struct fib_rule local_rule = { - .r_action = RTN_UNICAST, - }; - --static struct fib_rule *fib_rules = &local_rule; - static DEFINE_RWLOCK(fib_rules_lock); - -+void __init prepare_fib_rules(void) -+{ -+#ifdef CONFIG_VE -+ get_ve0()->_local_rule = &local_rule; -+ get_ve0()->_fib_rules = &local_rule; -+#endif -+} -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define local_rule (*(get_exec_env()->_local_rule)) -+#define fib_rules (get_exec_env()->_fib_rules) -+#else -+static struct fib_rule *fib_rules = &local_rule; -+#endif -+ -+#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE) -+int fib_rules_create() -+{ -+ struct 
fib_rule *default_rule, *main_rule, *loc_rule; -+ -+ default_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); -+ if (default_rule == NULL) -+ goto out_def; -+ memset(default_rule, 0, sizeof(struct fib_rule)); -+ atomic_set(&default_rule->r_clntref, 1); -+ default_rule->r_preference = 0x7FFF; -+ default_rule->r_table = RT_TABLE_DEFAULT; -+ default_rule->r_action = RTN_UNICAST; -+ -+ main_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); -+ if (main_rule == NULL) -+ goto out_main; -+ memset(main_rule, 0, sizeof(struct fib_rule)); -+ atomic_set(&main_rule->r_clntref, 1); -+ main_rule->r_preference = 0x7FFE; -+ main_rule->r_table = RT_TABLE_MAIN; -+ main_rule->r_action = RTN_UNICAST; -+ main_rule->r_next = default_rule; -+ -+ loc_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); -+ if (loc_rule == NULL) -+ goto out_loc; -+ memset(loc_rule, 0, sizeof(struct fib_rule)); -+ atomic_set(&loc_rule->r_clntref, 1); -+ loc_rule->r_preference = 0; -+ loc_rule->r_table = RT_TABLE_LOCAL; -+ loc_rule->r_action = RTN_UNICAST; -+ loc_rule->r_next = main_rule; -+ -+ get_exec_env()->_local_rule = loc_rule; -+ get_exec_env()->_fib_rules = loc_rule; -+ -+ return 0; -+ -+out_loc: -+ kfree(main_rule); -+out_main: -+ kfree(default_rule); -+out_def: -+ return -1; -+} -+ -+void fib_rules_destroy() -+{ -+ struct fib_rule *r; -+ -+ rtnl_lock(); -+ write_lock_bh(&fib_rules_lock); -+ while(fib_rules != NULL) { -+ r = fib_rules; -+ fib_rules = fib_rules->r_next; -+ r->r_dead = 1; -+ fib_rule_put(r); -+ } -+ write_unlock_bh(&fib_rules_lock); -+ rtnl_unlock(); -+} -+#endif -+ - int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) - { - struct rtattr **rta = arg; -@@ -435,5 +514,6 @@ int inet_dump_rules(struct sk_buff *skb, - - void __init fib_rules_init(void) - { -+ prepare_fib_rules(); - register_netdevice_notifier(&fib_rules_notifier); - } -diff -upr linux-2.6.16.orig/net/ipv4/fib_semantics.c linux-2.6.16-026test009/net/ipv4/fib_semantics.c ---- 
linux-2.6.16.orig/net/ipv4/fib_semantics.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/fib_semantics.c 2006-04-19 15:02:12.000000000 +0400 -@@ -33,6 +33,7 @@ - #include <linux/netdevice.h> - #include <linux/if_arp.h> - #include <linux/proc_fs.h> -+#include <linux/ve.h> - #include <linux/skbuff.h> - #include <linux/netlink.h> - #include <linux/init.h> -@@ -56,6 +57,24 @@ static struct hlist_head *fib_info_laddr - static unsigned int fib_hash_size; - static unsigned int fib_info_cnt; - -+void prepare_fib_info(void) -+{ -+#ifdef CONFIG_VE -+ get_ve0()->_fib_info_hash = fib_info_hash; -+ get_ve0()->_fib_info_laddrhash = fib_info_laddrhash; -+ get_ve0()->_fib_hash_size = fib_hash_size; -+ get_ve0()->_fib_info_cnt = fib_info_cnt; -+#endif -+} -+ -+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) -+#define fib_info_hash (get_exec_env()->_fib_info_hash) -+#define fib_info_laddrhash (get_exec_env()->_fib_info_laddrhash) -+#define fib_hash_size (get_exec_env()->_fib_hash_size) -+#define fib_info_cnt (get_exec_env()->_fib_info_cnt) -+#endif -+ -+ - #define DEVINDEX_HASHBITS 8 - #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) - static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; -@@ -235,13 +254,15 @@ static struct fib_info *fib_find_info(co - return NULL; - } - --static inline unsigned int fib_devindex_hashfn(unsigned int val) -+static inline unsigned int fib_devindex_hashfn(unsigned int val, -+ envid_t veid) - { - unsigned int mask = DEVINDEX_HASHSIZE - 1; - - return (val ^ - (val >> DEVINDEX_HASHBITS) ^ -- (val >> (DEVINDEX_HASHBITS * 2))) & mask; -+ (val >> (DEVINDEX_HASHBITS * 2)) ^ -+ (veid ^ (veid >> 16))) & mask; - } - - /* Check, that the gateway is already configured. 
-@@ -257,7 +278,7 @@ int ip_fib_check_default(u32 gw, struct - - read_lock(&fib_info_lock); - -- hash = fib_devindex_hashfn(dev->ifindex); -+ hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); - head = &fib_info_devhash[hash]; - hlist_for_each_entry(nh, node, head, nh_hash) { - if (nh->nh_dev == dev && -@@ -580,7 +601,7 @@ static struct hlist_head *fib_hash_alloc - __get_free_pages(GFP_KERNEL, get_order(bytes)); - } - --static void fib_hash_free(struct hlist_head *hash, int bytes) -+void fib_hash_free(struct hlist_head *hash, int bytes) - { - if (!hash) - return; -@@ -837,7 +858,8 @@ link_it: - - if (!nh->nh_dev) - continue; -- hash = fib_devindex_hashfn(nh->nh_dev->ifindex); -+ hash = fib_devindex_hashfn(nh->nh_dev->ifindex, -+ VEID(nh->nh_dev->owner_env)); - head = &fib_info_devhash[hash]; - hlist_add_head(&nh->nh_hash, head); - } endfor_nexthops(fi) -@@ -1184,7 +1206,8 @@ int fib_sync_down(u32 local, struct net_ - - if (dev) { - struct fib_info *prev_fi = NULL; -- unsigned int hash = fib_devindex_hashfn(dev->ifindex); -+ unsigned int hash = fib_devindex_hashfn(dev->ifindex, -+ VEID(dev->owner_env)); - struct hlist_head *head = &fib_info_devhash[hash]; - struct hlist_node *node; - struct fib_nh *nh; -@@ -1249,7 +1272,7 @@ int fib_sync_up(struct net_device *dev) - return 0; - - prev_fi = NULL; -- hash = fib_devindex_hashfn(dev->ifindex); -+ hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); - head = &fib_info_devhash[hash]; - ret = 0; - -diff -upr linux-2.6.16.orig/net/ipv4/fib_trie.c linux-2.6.16-026test009/net/ipv4/fib_trie.c ---- linux-2.6.16.orig/net/ipv4/fib_trie.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/fib_trie.c 2006-04-19 15:02:11.000000000 +0400 -@@ -314,11 +314,6 @@ static void __leaf_free_rcu(struct rcu_h - kfree(container_of(head, struct leaf, rcu)); - } - --static inline void free_leaf(struct leaf *leaf) --{ -- call_rcu(&leaf->rcu, __leaf_free_rcu); --} -- - static void 
__leaf_info_free_rcu(struct rcu_head *head) - { - kfree(container_of(head, struct leaf_info, rcu)); -@@ -357,7 +352,12 @@ static void __tnode_free_rcu(struct rcu_ - - static inline void tnode_free(struct tnode *tn) - { -- call_rcu(&tn->rcu, __tnode_free_rcu); -+ if(IS_LEAF(tn)) { -+ struct leaf *l = (struct leaf *) tn; -+ call_rcu_bh(&l->rcu, __leaf_free_rcu); -+ } -+ else -+ call_rcu(&tn->rcu, __tnode_free_rcu); - } - - static struct leaf *leaf_new(void) -diff -upr linux-2.6.16.orig/net/ipv4/inet_connection_sock.c linux-2.6.16-026test009/net/ipv4/inet_connection_sock.c ---- linux-2.6.16.orig/net/ipv4/inet_connection_sock.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/inet_connection_sock.c 2006-04-19 15:02:12.000000000 +0400 -@@ -25,6 +25,9 @@ - #include <net/tcp_states.h> - #include <net/xfrm.h> - -+#include <ub/ub_net.h> -+#include <ub/ub_orphan.h> -+ - #ifdef INET_CSK_DEBUG - const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; - EXPORT_SYMBOL(inet_csk_timer_bug_msg); -@@ -48,6 +51,7 @@ int inet_csk_bind_conflict(const struct - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - !inet_v6_ipv6only(sk2) && -+ !ve_accessible_strict(VE_OWNER_SK(sk), VE_OWNER_SK(sk2)) && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { -@@ -77,7 +81,9 @@ int inet_csk_get_port(struct inet_hashin - struct hlist_node *node; - struct inet_bind_bucket *tb; - int ret; -+ struct ve_struct *env; - -+ env = VE_OWNER_SK(sk); - local_bh_disable(); - if (!snum) { - int low = sysctl_local_port_range[0]; -@@ -86,11 +92,15 @@ int inet_csk_get_port(struct inet_hashin - int rover = net_random() % (high - low) + low; - - do { -- head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; -+ head = &hashinfo->bhash[inet_bhashfn(rover, -+ hashinfo->bhash_size, VEID(env))]; - spin_lock(&head->lock); -- inet_bind_bucket_for_each(tb, node, &head->chain) -+ 
inet_bind_bucket_for_each(tb, node, &head->chain) { -+ if (!ve_accessible_strict(VE_OWNER_TB(tb),env)) -+ continue; - if (tb->port == rover) - goto next; -+ } - break; - next: - spin_unlock(&head->lock); -@@ -113,11 +123,15 @@ int inet_csk_get_port(struct inet_hashin - */ - snum = rover; - } else { -- head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; -+ head = &hashinfo->bhash[inet_bhashfn(snum, -+ hashinfo->bhash_size, VEID(env))]; - spin_lock(&head->lock); -- inet_bind_bucket_for_each(tb, node, &head->chain) -+ inet_bind_bucket_for_each(tb, node, &head->chain) { -+ if (!ve_accessible_strict(VE_OWNER_TB(tb), env)) -+ continue; - if (tb->port == snum) - goto tb_found; -+ } - } - tb = NULL; - goto tb_not_found; -@@ -136,7 +150,7 @@ tb_found: - } - tb_not_found: - ret = 1; -- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) -+ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum, env)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) -@@ -541,7 +555,7 @@ void inet_csk_destroy_sock(struct sock * - - sk_refcnt_debug_release(sk); - -- atomic_dec(sk->sk_prot->orphan_count); -+ ub_dec_orphan_count(sk); - sock_put(sk); - } - -@@ -621,7 +635,7 @@ void inet_csk_listen_stop(struct sock *s - - sock_orphan(child); - -- atomic_inc(sk->sk_prot->orphan_count); -+ ub_inc_orphan_count(sk); - - inet_csk_destroy_sock(child); - -diff -upr linux-2.6.16.orig/net/ipv4/inet_diag.c linux-2.6.16-026test009/net/ipv4/inet_diag.c ---- linux-2.6.16.orig/net/ipv4/inet_diag.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/inet_diag.c 2006-04-19 15:02:12.000000000 +0400 -@@ -673,7 +673,9 @@ static int inet_diag_dump(struct sk_buff - struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - const struct inet_diag_handler *handler; - struct inet_hashinfo *hashinfo; -+ struct ve_struct *ve; - -+ ve = get_exec_env(); - handler = 
inet_diag_table[cb->nlh->nlmsg_type]; - BUG_ON(handler == NULL); - hashinfo = handler->idiag_hashinfo; -@@ -694,6 +696,8 @@ static int inet_diag_dump(struct sk_buff - sk_for_each(sk, node, &hashinfo->listening_hash[i]) { - struct inet_sock *inet = inet_sk(sk); - -+ if (!ve_accessible(VE_OWNER_SK(sk), ve)) -+ continue; - if (num < s_num) { - num++; - continue; -@@ -754,6 +758,8 @@ skip_listen_ht: - sk_for_each(sk, node, &head->chain) { - struct inet_sock *inet = inet_sk(sk); - -+ if (!ve_accessible(VE_OWNER_SK(sk), ve)) -+ continue; - if (num < s_num) - goto next_normal; - if (!(r->idiag_states & (1 << sk->sk_state))) -@@ -778,6 +784,8 @@ next_normal: - inet_twsk_for_each(tw, node, - &hashinfo->ehash[i + hashinfo->ehash_size].chain) { - -+ if (!ve_accessible_veid(inet_twsk(sk)->tw_owner_env, VEID(ve))) -+ continue; - if (num < s_num) - goto next_dying; - if (r->id.idiag_sport != tw->tw_sport && -diff -upr linux-2.6.16.orig/net/ipv4/inet_hashtables.c linux-2.6.16-026test009/net/ipv4/inet_hashtables.c ---- linux-2.6.16.orig/net/ipv4/inet_hashtables.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/inet_hashtables.c 2006-04-19 15:02:12.000000000 +0400 -@@ -30,7 +30,8 @@ - */ - struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, - struct inet_bind_hashbucket *head, -- const unsigned short snum) -+ const unsigned short snum, -+ struct ve_struct *ve) - { - struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); - -@@ -38,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucke - tb->port = snum; - tb->fastreuse = 0; - INIT_HLIST_HEAD(&tb->owners); -+ SET_VE_OWNER_TB(tb, ve); - hlist_add_head(&tb->node, &head->chain); - } - return tb; -@@ -71,10 +73,13 @@ EXPORT_SYMBOL(inet_bind_hash); - */ - static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) - { -- const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); -- struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; -+ int 
bhash; -+ struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - -+ bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size, -+ VEID(VE_OWNER_SK(sk))); -+ head = &hashinfo->bhash[bhash]; - spin_lock(&head->lock); - tb = inet_csk(sk)->icsk_bind_hash; - __sk_del_bind_node(sk); -@@ -130,7 +135,8 @@ EXPORT_SYMBOL(inet_listen_wlock); - * wildcarded during the search since they can never be otherwise. - */ - struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, -- const unsigned short hnum, const int dif) -+ const unsigned short hnum, const int dif, -+ struct ve_struct *env) - { - struct sock *result = NULL, *sk; - const struct hlist_node *node; -@@ -139,6 +145,8 @@ struct sock *__inet_lookup_listener(cons - sk_for_each(sk, node, head) { - const struct inet_sock *inet = inet_sk(sk); - -+ if (!ve_accessible_strict(VE_OWNER_SK(sk), env)) -+ continue; - if (inet->num == hnum && !ipv6_only_sock(sk)) { - const __u32 rcv_saddr = inet->rcv_saddr; - int score = sk->sk_family == PF_INET ? 
1 : 0; -@@ -169,7 +177,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_listener - /* called with local bh disabled */ - static int __inet_check_established(struct inet_timewait_death_row *death_row, - struct sock *sk, __u16 lport, -- struct inet_timewait_sock **twp) -+ struct inet_timewait_sock **twp, -+ struct ve_struct *ve) - { - struct inet_hashinfo *hinfo = death_row->hashinfo; - struct inet_sock *inet = inet_sk(sk); -@@ -178,12 +187,15 @@ static int __inet_check_established(stru - int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); -- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); -- struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); -+ unsigned int hash; -+ struct inet_ehash_bucket *head; - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - -+ hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(ve)); -+ head = inet_ehash_bucket(hinfo, hash); -+ - prefetch(head->chain.first); - write_lock(&head->lock); - -@@ -191,7 +203,8 @@ static int __inet_check_established(stru - sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { - tw = inet_twsk(sk2); - -- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { -+ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, -+ ports, dif, ve)) { - if (twsk_unique(sk, sk2, twp)) - goto unique; - else -@@ -202,7 +215,8 @@ static int __inet_check_established(stru - - /* And established part... 
*/ - sk_for_each(sk2, node, &head->chain) { -- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) -+ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, -+ ports, dif, ve)) - goto not_unique; - } - -@@ -253,7 +267,9 @@ int inet_hash_connect(struct inet_timewa - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; -+ struct ve_struct *ve; - -+ ve = VE_OWNER_SK(sk); - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; -@@ -268,7 +284,8 @@ int inet_hash_connect(struct inet_timewa - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; -- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; -+ head = &hinfo->bhash[inet_bhashfn(port, -+ hinfo->bhash_size, VEID(ve))]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, -@@ -282,13 +299,14 @@ int inet_hash_connect(struct inet_timewa - goto next_port; - if (!__inet_check_established(death_row, - sk, port, -- &tw)) -+ &tw, ve)) - goto ok; - goto next_port; - } - } - -- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); -+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, -+ head, port, ve); - if (!tb) { - spin_unlock(&head->lock); - break; -@@ -323,7 +341,7 @@ ok: - goto out; - } - -- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; -+ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { -@@ -333,7 +351,7 @@ ok: - } else { - spin_unlock(&head->lock); - /* No definite answer... 
Walk to established hash table */ -- ret = __inet_check_established(death_row, sk, snum, NULL); -+ ret = __inet_check_established(death_row, sk, snum, NULL, ve); - out: - local_bh_enable(); - return ret; -diff -upr linux-2.6.16.orig/net/ipv4/inet_timewait_sock.c linux-2.6.16-026test009/net/ipv4/inet_timewait_sock.c ---- linux-2.6.16.orig/net/ipv4/inet_timewait_sock.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/inet_timewait_sock.c 2006-04-19 15:02:12.000000000 +0400 -@@ -32,7 +32,8 @@ void __inet_twsk_kill(struct inet_timewa - write_unlock(&ehead->lock); - - /* Disassociate with bind bucket. */ -- bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; -+ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, -+ hashinfo->bhash_size, tw->tw_owner_env)]; - spin_lock(&bhead->lock); - tb = tw->tw_tb; - __hlist_del(&tw->tw_bind_node); -@@ -66,7 +67,8 @@ void __inet_twsk_hashdance(struct inet_t - Note, that any socket with inet->num != 0 MUST be bound in - binding cache, even if it is closed. 
- */ -- bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; -+ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, -+ hashinfo->bhash_size, tw->tw_owner_env)]; - spin_lock(&bhead->lock); - tw->tw_tb = icsk->icsk_bind_hash; - BUG_TRAP(icsk->icsk_bind_hash); -@@ -90,9 +92,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance) - - struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) - { -- struct inet_timewait_sock *tw = -- kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, -- SLAB_ATOMIC); -+ struct user_beancounter *ub; -+ struct inet_timewait_sock *tw; -+ -+ ub = set_exec_ub(sock_bc(sk)->ub); -+ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, -+ SLAB_ATOMIC); -+ (void)set_exec_ub(ub); -+ - if (tw != NULL) { - const struct inet_sock *inet = inet_sk(sk); - -diff -upr linux-2.6.16.orig/net/ipv4/ip_forward.c linux-2.6.16-026test009/net/ipv4/ip_forward.c ---- linux-2.6.16.orig/net/ipv4/ip_forward.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/ip_forward.c 2006-04-19 15:02:12.000000000 +0400 -@@ -87,6 +87,24 @@ int ip_forward(struct sk_buff *skb) - if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) - goto sr_failed; - -+ /* -+ * We try to optimize forwarding of VE packets: -+ * do not decrement TTL (and so save skb_cow) -+ * during forwarding of outgoing pkts from VE. -+ * For incoming pkts we still do ttl decr, -+ * since such skb is not cloned and does not require -+ * actual cow. So, there is at least one place -+ * in pkts path with mandatory ttl decr, that is -+ * sufficient to prevent routing loops. -+ */ -+ iph = skb->nh.iph; -+ if ( -+#ifdef CONFIG_IP_ROUTE_NAT -+ (rt->rt_flags & RTCF_NAT) == 0 && /* no NAT mangling expected */ -+#endif /* and */ -+ (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */ -+ goto no_ttl_decr; -+ - /* We are about to mangle packet. Copy it! 
*/ - if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) - goto drop; -@@ -95,6 +113,8 @@ int ip_forward(struct sk_buff *skb) - /* Decrease ttl after skb cow done */ - ip_decrease_ttl(iph); - -+no_ttl_decr: -+ - /* - * We now generate an ICMP HOST REDIRECT giving the route - * we calculated. -diff -upr linux-2.6.16.orig/net/ipv4/ip_fragment.c linux-2.6.16-026test009/net/ipv4/ip_fragment.c ---- linux-2.6.16.orig/net/ipv4/ip_fragment.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/ip_fragment.c 2006-04-19 15:02:12.000000000 +0400 -@@ -44,6 +44,7 @@ - #include <linux/udp.h> - #include <linux/inet.h> - #include <linux/netfilter_ipv4.h> -+#include <linux/ve_owner.h> - - /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 - * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c -@@ -97,8 +98,12 @@ struct ipq { - int iif; - unsigned int rid; - struct inet_peer *peer; -+ struct ve_struct *owner_env; - }; - -+DCL_VE_OWNER_PROTO(IPQ, struct ipq, owner_env) -+DCL_VE_OWNER(IPQ, struct ipq, owner_env) -+ - /* Hash table. */ - - #define IPQ_HASHSZ 64 -@@ -182,7 +187,8 @@ static __inline__ void frag_free_queue(s - - static __inline__ struct ipq *frag_alloc_queue(void) - { -- struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); -+ struct ipq *qp = kmalloc(sizeof(struct ipq) + sizeof(void *), -+ GFP_ATOMIC); - - if(!qp) - return NULL; -@@ -278,6 +284,9 @@ static void ip_evictor(void) - static void ip_expire(unsigned long arg) - { - struct ipq *qp = (struct ipq *) arg; -+ struct ve_struct *envid; -+ -+ envid = set_exec_env(VE_OWNER_IPQ(qp)); - - spin_lock(&qp->lock); - -@@ -300,6 +309,8 @@ static void ip_expire(unsigned long arg) - out: - spin_unlock(&qp->lock); - ipq_put(qp, NULL); -+ -+ (void)set_exec_env(envid); - } - - /* Creation primitives. 
*/ -@@ -321,7 +332,8 @@ static struct ipq *ip_frag_intern(unsign - qp->saddr == qp_in->saddr && - qp->daddr == qp_in->daddr && - qp->protocol == qp_in->protocol && -- qp->user == qp_in->user) { -+ qp->user == qp_in->user && -+ qp->owner_env == get_exec_env()) { - atomic_inc(&qp->refcnt); - write_unlock(&ipfrag_lock); - qp_in->last_in |= COMPLETE; -@@ -371,6 +383,8 @@ static struct ipq *ip_frag_create(unsign - spin_lock_init(&qp->lock); - atomic_set(&qp->refcnt, 1); - -+ SET_VE_OWNER_IPQ(qp, get_exec_env()); -+ - return ip_frag_intern(hash, qp); - - out_nomem: -@@ -397,7 +411,8 @@ static inline struct ipq *ip_find(struct - qp->saddr == saddr && - qp->daddr == daddr && - qp->protocol == protocol && -- qp->user == user) { -+ qp->user == user && -+ qp->owner_env == get_exec_env()) { - atomic_inc(&qp->refcnt); - read_unlock(&ipfrag_lock); - return qp; -@@ -719,6 +734,9 @@ struct sk_buff *ip_defrag(struct sk_buff - qp->meat == qp->len) - ret = ip_frag_reasm(qp, dev); - -+ if (ret) -+ SET_VE_OWNER_SKB(ret, VE_OWNER_SKB(skb)); -+ - spin_unlock(&qp->lock); - ipq_put(qp, NULL); - return ret; -@@ -729,6 +747,51 @@ struct sk_buff *ip_defrag(struct sk_buff - return NULL; - } - -+#ifdef CONFIG_VE -+/* XXX */ -+void ip_fragment_cleanup(struct ve_struct *envid) -+{ -+ int i, progress; -+ -+ /* All operations with fragment queues are performed from NET_RX/TX -+ * soft interrupts or from timer context. 
--Den */ -+ local_bh_disable(); -+ do { -+ progress = 0; -+ for (i = 0; i < IPQ_HASHSZ; i++) { -+ struct ipq *qp; -+ struct hlist_node *p, *n; -+ -+ if (hlist_empty(&ipq_hash[i])) -+ continue; -+inner_restart: -+ read_lock(&ipfrag_lock); -+ hlist_for_each_entry_safe(qp, p, n, -+ &ipq_hash[i], list) { -+ if (!ve_accessible_strict( -+ VE_OWNER_IPQ(qp), -+ envid)) -+ continue; -+ atomic_inc(&qp->refcnt); -+ read_unlock(&ipfrag_lock); -+ -+ spin_lock(&qp->lock); -+ if (!(qp->last_in&COMPLETE)) -+ ipq_kill(qp); -+ spin_unlock(&qp->lock); -+ -+ ipq_put(qp, NULL); -+ progress = 1; -+ goto inner_restart; -+ } -+ read_unlock(&ipfrag_lock); -+ } -+ } while(progress); -+ local_bh_enable(); -+} -+EXPORT_SYMBOL(ip_fragment_cleanup); -+#endif -+ - void ipfrag_init(void) - { - ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ -diff -upr linux-2.6.16.orig/net/ipv4/ip_output.c linux-2.6.16-026test009/net/ipv4/ip_output.c ---- linux-2.6.16.orig/net/ipv4/ip_output.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/ip_output.c 2006-04-19 15:02:11.000000000 +0400 -@@ -86,8 +86,6 @@ - - int sysctl_ip_default_ttl = IPDEFTTL; - --static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)); -- - /* Generate a checksum for an outgoing IP datagram. */ - __inline__ void ip_send_check(struct iphdr *iph) - { -@@ -421,7 +419,7 @@ static void ip_copy_metadata(struct sk_b - * single device frame, and queue such a frame for sending. 
- */ - --static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) -+int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) - { - struct iphdr *iph; - int raw = 0; -@@ -673,6 +671,8 @@ fail: - return err; - } - -+EXPORT_SYMBOL(ip_fragment); -+ - int - ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) - { -@@ -1249,11 +1249,7 @@ int ip_push_pending_frames(struct sock * - iph->tos = inet->tos; - iph->tot_len = htons(skb->len); - iph->frag_off = df; -- if (!df) { -- __ip_select_ident(iph, &rt->u.dst, 0); -- } else { -- iph->id = htons(inet->id++); -- } -+ ip_select_ident(iph, &rt->u.dst, sk); - iph->ttl = ttl; - iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; -diff -upr linux-2.6.16.orig/net/ipv4/ipmr.c linux-2.6.16-026test009/net/ipv4/ipmr.c ---- linux-2.6.16.orig/net/ipv4/ipmr.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/ipmr.c 2006-04-19 15:02:12.000000000 +0400 -@@ -837,7 +837,7 @@ static void mrtsock_destruct(struct sock - { - rtnl_lock(); - if (sk == mroute_socket) { -- ipv4_devconf.mc_forwarding--; -+ ve_ipv4_devconf.mc_forwarding--; - - write_lock_bh(&mrt_lock); - mroute_socket=NULL; -@@ -888,7 +888,7 @@ int ip_mroute_setsockopt(struct sock *sk - mroute_socket=sk; - write_unlock_bh(&mrt_lock); - -- ipv4_devconf.mc_forwarding++; -+ ve_ipv4_devconf.mc_forwarding++; - } - rtnl_unlock(); - return ret; -diff -upr linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_conn.c linux-2.6.16-026test009/net/ipv4/ipvs/ip_vs_conn.c ---- linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_conn.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/ipvs/ip_vs_conn.c 2006-04-19 15:02:11.000000000 +0400 -@@ -902,7 +902,8 @@ int ip_vs_conn_init(void) - /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, -- SLAB_HWCACHE_ALIGN, NULL, NULL); -+ SLAB_HWCACHE_ALIGN | SLAB_UBC, -+ NULL, NULL); - if 
(!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); - return -ENOMEM; -diff -upr linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_core.c linux-2.6.16-026test009/net/ipv4/ipvs/ip_vs_core.c ---- linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_core.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/ipvs/ip_vs_core.c 2006-04-19 15:02:12.000000000 +0400 -@@ -952,6 +952,10 @@ ip_vs_in(unsigned int hooknum, struct sk - * Big tappo: only PACKET_HOST (neither loopback nor mcasts) - * ... don't know why 1st test DOES NOT include 2nd (?) - */ -+ /* -+ * VZ: the question above is right. -+ * The second test is superfluous. -+ */ - if (unlikely(skb->pkt_type != PACKET_HOST - || skb->dev == &loopback_dev || skb->sk)) { - IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_core.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_core.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_core.c 2006-04-19 15:02:12.000000000 +0400 -@@ -49,6 +49,7 @@ - #include <linux/netfilter_ipv4/ip_conntrack_helper.h> - #include <linux/netfilter_ipv4/ip_conntrack_core.h> - #include <linux/netfilter_ipv4/listhelp.h> -+#include <ub/ub_mem.h> - - #define IP_CONNTRACK_VERSION "2.4" - -@@ -60,22 +61,41 @@ - - DEFINE_RWLOCK(ip_conntrack_lock); - --/* ip_conntrack_standalone needs this */ --atomic_t ip_conntrack_count = ATOMIC_INIT(0); -+#ifdef CONFIG_VE_IPTABLES -+#define ve_ip_conntrack_helpers \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_helpers) -+#define ve_ip_conntrack_max \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_max) -+#define ve_ip_conntrack_count \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_count) -+#define ve_ip_conntrack_unconfirmed \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_unconfirmed) -+#else - - void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; - 
LIST_HEAD(ip_conntrack_expect_list); - struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; - static LIST_HEAD(helpers); -+struct list_head *ip_conntrack_hash; -+static LIST_HEAD(unconfirmed); -+#define ve_ip_conntrack_count ip_conntrack_count -+#define ve_ip_conntrack_helpers helpers -+#define ve_ip_conntrack_max ip_conntrack_max -+#define ve_ip_conntrack_unconfirmed unconfirmed -+#endif -+ -+/* ip_conntrack_standalone needs this */ -+atomic_t ip_conntrack_count = ATOMIC_INIT(0); -+ - unsigned int ip_conntrack_htable_size = 0; - int ip_conntrack_max; --struct list_head *ip_conntrack_hash; - static kmem_cache_t *ip_conntrack_cachep __read_mostly; - static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; - struct ip_conntrack ip_conntrack_untracked; - unsigned int ip_ct_log_invalid; --static LIST_HEAD(unconfirmed); -+#ifndef CONFIG_VE - static int ip_conntrack_vmalloc; -+#endif - - static unsigned int ip_conntrack_next_id = 1; - static unsigned int ip_conntrack_expect_next_id = 1; -@@ -105,6 +125,9 @@ void ip_ct_deliver_cached_events(const s - { - struct ip_conntrack_ecache *ecache; - -+ if (!ve_is_super(get_exec_env())) -+ return; -+ - local_bh_disable(); - ecache = &__get_cpu_var(ip_conntrack_ecache); - if (ecache->ct == ct) -@@ -133,6 +156,9 @@ static void ip_ct_event_cache_flush(void - struct ip_conntrack_ecache *ecache; - int cpu; - -+ if (!ve_is_super(get_exec_env())) -+ return; -+ - for_each_cpu(cpu) { - ecache = &per_cpu(ip_conntrack_ecache, cpu); - if (ecache->ct) -@@ -226,7 +252,7 @@ __ip_conntrack_expect_find(const struct - { - struct ip_conntrack_expect *i; - -- list_for_each_entry(i, &ip_conntrack_expect_list, list) { -+ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { - if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { - atomic_inc(&i->use); - return i; -@@ -255,7 +281,7 @@ find_expectation(const struct ip_conntra - { - struct ip_conntrack_expect *i; - -- list_for_each_entry(i, &ip_conntrack_expect_list, list) { -+ 
list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { - /* If master is not in hash table yet (ie. packet hasn't left - this machine yet), how can other end know about expected? - Hence these are not the droids you are looking for (if -@@ -284,7 +310,7 @@ void ip_ct_remove_expectations(struct ip - if (ct->expecting == 0) - return; - -- list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { -+ list_for_each_entry_safe(i, tmp, &ve_ip_conntrack_expect_list, list) { - if (i->master == ct && del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - ip_conntrack_expect_put(i); -@@ -302,8 +328,10 @@ clean_from_lists(struct ip_conntrack *ct - - ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); -- LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); -- LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); -+ LIST_DELETE(&ve_ip_conntrack_hash[ho], -+ &ct->tuplehash[IP_CT_DIR_ORIGINAL]); -+ LIST_DELETE(&ve_ip_conntrack_hash[hr], -+ &ct->tuplehash[IP_CT_DIR_REPLY]); - - /* Destroy all pending expectations */ - ip_ct_remove_expectations(ct); -@@ -329,8 +357,8 @@ destroy_conntrack(struct nf_conntrack *n - if (proto && proto->destroy) - proto->destroy(ct); - -- if (ip_conntrack_destroyed) -- ip_conntrack_destroyed(ct); -+ if (ve_ip_conntrack_destroyed) -+ ve_ip_conntrack_destroyed(ct); - - write_lock_bh(&ip_conntrack_lock); - /* Expectations will have been removed in clean_from_lists, -@@ -358,7 +386,11 @@ destroy_conntrack(struct nf_conntrack *n - static void death_by_timeout(unsigned long ul_conntrack) - { - struct ip_conntrack *ct = (void *)ul_conntrack; -+#ifdef CONFIG_VE_IPTABLES -+ struct ve_struct *old; - -+ old = set_exec_env(VE_OWNER_CT(ct)); -+#endif - write_lock_bh(&ip_conntrack_lock); - /* Inside lock so preempt is disabled on module removal path. - * Otherwise we can get spurious warnings. 
*/ -@@ -366,6 +398,9 @@ static void death_by_timeout(unsigned lo - clean_from_lists(ct); - write_unlock_bh(&ip_conntrack_lock); - ip_conntrack_put(ct); -+#ifdef CONFIG_VE_IPTABLES -+ (void)set_exec_env(old); -+#endif - } - - static inline int -@@ -386,7 +421,7 @@ __ip_conntrack_find(const struct ip_conn - unsigned int hash = hash_conntrack(tuple); - - ASSERT_READ_LOCK(&ip_conntrack_lock); -- list_for_each_entry(h, &ip_conntrack_hash[hash], list) { -+ list_for_each_entry(h, &ve_ip_conntrack_hash[hash], list) { - if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { - CONNTRACK_STAT_INC(found); - return h; -@@ -418,9 +453,9 @@ static void __ip_conntrack_hash_insert(s - unsigned int repl_hash) - { - ct->id = ++ip_conntrack_next_id; -- list_prepend(&ip_conntrack_hash[hash], -+ list_prepend(&ve_ip_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); -- list_prepend(&ip_conntrack_hash[repl_hash], -+ list_prepend(&ve_ip_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY].list); - } - -@@ -471,11 +506,11 @@ __ip_conntrack_confirm(struct sk_buff ** - /* See if there's one in the list already, including reverse: - NAT could have grabbed it without realizing, since we're - not in the hash. If there is, we lost race. 
*/ -- if (!LIST_FIND(&ip_conntrack_hash[hash], -+ if (!LIST_FIND(&ve_ip_conntrack_hash[hash], - conntrack_tuple_cmp, - struct ip_conntrack_tuple_hash *, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) -- && !LIST_FIND(&ip_conntrack_hash[repl_hash], -+ && !LIST_FIND(&ve_ip_conntrack_hash[repl_hash], - conntrack_tuple_cmp, - struct ip_conntrack_tuple_hash *, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { -@@ -569,7 +604,7 @@ static inline int helper_cmp(const struc - static struct ip_conntrack_helper * - __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) - { -- return LIST_FIND(&helpers, helper_cmp, -+ return LIST_FIND(&ve_ip_conntrack_helpers, helper_cmp, - struct ip_conntrack_helper *, - tuple); - } -@@ -605,7 +640,7 @@ void ip_conntrack_helper_put(struct ip_c - struct ip_conntrack_protocol * - __ip_conntrack_proto_find(u_int8_t protocol) - { -- return ip_ct_protos[protocol]; -+ return ve_ip_ct_protos[protocol]; - } - - /* this is guaranteed to always return a valid protocol helper, since -@@ -632,29 +667,32 @@ void ip_conntrack_proto_put(struct ip_co - } - - struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, -- struct ip_conntrack_tuple *repl) -+ struct ip_conntrack_tuple *repl, struct user_beancounter *ub) - { - struct ip_conntrack *conntrack; -+ struct user_beancounter *old_ub; - - if (!ip_conntrack_hash_rnd_initted) { - get_random_bytes(&ip_conntrack_hash_rnd, 4); - ip_conntrack_hash_rnd_initted = 1; - } - -- if (ip_conntrack_max -- && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { -+ if (ve_ip_conntrack_max -+ && atomic_read(&ve_ip_conntrack_count) >= ve_ip_conntrack_max) { - unsigned int hash = hash_conntrack(orig); - /* Try dropping from this hash chain. 
*/ -- if (!early_drop(&ip_conntrack_hash[hash])) { -+ if (!early_drop(&ve_ip_conntrack_hash[hash])) { - if (net_ratelimit()) -- printk(KERN_WARNING -- "ip_conntrack: table full, dropping" -- " packet.\n"); -+ ve_printk(VE_LOG_BOTH, KERN_WARNING -+ "ip_conntrack: VPS %d: table full, dropping" -+ " packet.\n", VEID(get_exec_env())); - return ERR_PTR(-ENOMEM); - } - } - -+ old_ub = set_exec_ub(ub); - conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); -+ (void)set_exec_ub(old_ub); - if (!conntrack) { - DEBUGP("Can't allocate conntrack.\n"); - return ERR_PTR(-ENOMEM); -@@ -669,8 +707,11 @@ struct ip_conntrack *ip_conntrack_alloc( - init_timer(&conntrack->timeout); - conntrack->timeout.data = (unsigned long)conntrack; - conntrack->timeout.function = death_by_timeout; -+#ifdef CONFIG_VE_IPTABLES -+ SET_VE_OWNER_CT(conntrack, get_exec_env()); -+#endif - -- atomic_inc(&ip_conntrack_count); -+ atomic_inc(&ve_ip_conntrack_count); - - return conntrack; - } -@@ -678,7 +719,7 @@ struct ip_conntrack *ip_conntrack_alloc( - void - ip_conntrack_free(struct ip_conntrack *conntrack) - { -- atomic_dec(&ip_conntrack_count); -+ atomic_dec(&ve_ip_conntrack_count); - kmem_cache_free(ip_conntrack_cachep, conntrack); - } - -@@ -692,13 +733,22 @@ init_conntrack(struct ip_conntrack_tuple - struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - struct ip_conntrack_expect *exp; -+ struct user_beancounter *ub; - - if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - -- conntrack = ip_conntrack_alloc(tuple, &repl_tuple); -+#ifdef CONFIG_USER_RESOURCE -+ if (skb->dev != NULL) /* received skb */ -+ ub = netdev_bc(skb->dev)->exec_ub; -+ else if (skb->sk != NULL) /* sent skb */ -+ ub = sock_bc(skb->sk)->ub; -+ else -+#endif -+ ub = NULL; -+ conntrack = ip_conntrack_alloc(tuple, &repl_tuple, ub); - if (conntrack == NULL || IS_ERR(conntrack)) - return (struct ip_conntrack_tuple_hash *)conntrack; - -@@ -733,7 
+783,8 @@ init_conntrack(struct ip_conntrack_tuple - } - - /* Overload tuple linked list to put us in unconfirmed list. */ -- list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); -+ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, -+ &ve_ip_conntrack_unconfirmed); - - write_unlock_bh(&ip_conntrack_lock); - -@@ -925,7 +976,7 @@ void ip_conntrack_unexpect_related(struc - - write_lock_bh(&ip_conntrack_lock); - /* choose the the oldest expectation to evict */ -- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { -+ list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) { - if (expect_matches(i, exp) && del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - write_unlock_bh(&ip_conntrack_lock); -@@ -959,11 +1010,11 @@ void ip_conntrack_expect_put(struct ip_c - kmem_cache_free(ip_conntrack_expect_cachep, exp); - } - --static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) -+void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) - { - atomic_inc(&exp->use); - exp->master->expecting++; -- list_add(&exp->list, &ip_conntrack_expect_list); -+ list_add(&exp->list, &ve_ip_conntrack_expect_list); - - init_timer(&exp->timeout); - exp->timeout.data = (unsigned long)exp; -@@ -975,13 +1026,14 @@ static void ip_conntrack_expect_insert(s - atomic_inc(&exp->use); - CONNTRACK_STAT_INC(expect_create); - } -+EXPORT_SYMBOL_GPL(ip_conntrack_expect_insert); - - /* Race with expectations being used means we could have none to find; OK. 
*/ - static void evict_oldest_expect(struct ip_conntrack *master) - { - struct ip_conntrack_expect *i; - -- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { -+ list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) { - if (i->master == master) { - if (del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); -@@ -1012,7 +1064,7 @@ int ip_conntrack_expect_related(struct i - DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); - - write_lock_bh(&ip_conntrack_lock); -- list_for_each_entry(i, &ip_conntrack_expect_list, list) { -+ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { - if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. */ - if (refresh_timer(i)) { -@@ -1060,18 +1112,48 @@ int ip_conntrack_helper_register(struct - { - BUG_ON(me->timeout == 0); - write_lock_bh(&ip_conntrack_lock); -- list_prepend(&helpers, me); -+ list_prepend(&ve_ip_conntrack_helpers, me); - write_unlock_bh(&ip_conntrack_lock); - - return 0; - } - -+int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *me) -+{ -+ int ret; -+ struct module *mod = me->me; -+ -+ if (!ve_is_super(get_exec_env())) { -+ struct ip_conntrack_helper *tmp; -+ __module_get(mod); -+ ret = -ENOMEM; -+ tmp = kmalloc(sizeof(struct ip_conntrack_helper), GFP_KERNEL); -+ if (!tmp) -+ goto nomem; -+ memcpy(tmp, me, sizeof(struct ip_conntrack_helper)); -+ me = tmp; -+ } -+ -+ ret = ip_conntrack_helper_register(me); -+ if (ret) -+ goto out; -+ -+ return 0; -+out: -+ if (!ve_is_super(get_exec_env())){ -+ kfree(me); -+nomem: -+ module_put(mod); -+ } -+ return ret; -+} -+ - struct ip_conntrack_helper * - __ip_conntrack_helper_find_byname(const char *name) - { - struct ip_conntrack_helper *h; - -- list_for_each_entry(h, &helpers, list) { -+ list_for_each_entry(h, &ve_ip_conntrack_helpers, list) { - if (!strcmp(h->name, name)) - return h; - } -@@ -1096,19 +1178,20 @@ void ip_conntrack_helper_unregister(stru - - /* Need write lock here, to delete helper. 
*/ - write_lock_bh(&ip_conntrack_lock); -- LIST_DELETE(&helpers, me); -+ LIST_DELETE(&ve_ip_conntrack_helpers, me); - - /* Get rid of expectations */ -- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { -+ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, list) { - if (exp->master->helper == me && del_timer(&exp->timeout)) { - ip_ct_unlink_expect(exp); - ip_conntrack_expect_put(exp); - } - } - /* Get rid of expecteds, set helpers to NULL. */ -- LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); -+ LIST_FIND_W(&ve_ip_conntrack_unconfirmed, unhelp, -+ struct ip_conntrack_tuple_hash*, me); - for (i = 0; i < ip_conntrack_htable_size; i++) -- LIST_FIND_W(&ip_conntrack_hash[i], unhelp, -+ LIST_FIND_W(&ve_ip_conntrack_hash[i], unhelp, - struct ip_conntrack_tuple_hash *, me); - write_unlock_bh(&ip_conntrack_lock); - -@@ -1116,6 +1199,25 @@ void ip_conntrack_helper_unregister(stru - synchronize_net(); - } - -+void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) -+{ -+ -+ if (!ve_is_super(get_exec_env())) { -+ read_lock_bh(&ip_conntrack_lock); -+ me = list_named_find(&ve_ip_conntrack_helpers, me->name); -+ read_unlock_bh(&ip_conntrack_lock); -+ if (!me) -+ return; -+ } -+ -+ ip_conntrack_helper_unregister(me); -+ -+ if (!ve_is_super(get_exec_env())) { -+ module_put(me->me); -+ kfree(me); -+ } -+} -+ - /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ - void __ip_ct_refresh_acct(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, -@@ -1246,13 +1348,13 @@ get_next_corpse(int (*iter)(struct ip_co - - write_lock_bh(&ip_conntrack_lock); - for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { -- h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, -+ h = LIST_FIND_W(&ve_ip_conntrack_hash[*bucket], do_iter, - struct ip_conntrack_tuple_hash *, iter, data); - if (h) - break; - } - if (!h) -- h = LIST_FIND_W(&unconfirmed, do_iter, -+ h = 
LIST_FIND_W(&ve_ip_conntrack_unconfirmed, do_iter, - struct ip_conntrack_tuple_hash *, iter, data); - if (h) - atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); -@@ -1289,6 +1391,9 @@ getorigdst(struct sock *sk, int optval, - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_tuple tuple; - -+ if (!get_exec_env()->_ip_conntrack) -+ return -ENOPROTOOPT; -+ - IP_CT_TUPLE_U_BLANK(&tuple); - tuple.src.ip = inet->rcv_saddr; - tuple.src.u.tcp.port = inet->sport; -@@ -1359,12 +1464,17 @@ static void free_conntrack_hash(struct l - get_order(sizeof(struct list_head) * size)); - } - -+static void ip_conntrack_cache_free(void) -+{ -+ kmem_cache_destroy(ip_conntrack_expect_cachep); -+ kmem_cache_destroy(ip_conntrack_cachep); -+ nf_unregister_sockopt(&so_getorigdst); -+} -+ - /* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ - void ip_conntrack_cleanup(void) - { -- ip_ct_attach = NULL; -- - /* This makes sure all current packets have passed through - netfilter framework. Roll on, two-stage module - delete... 
*/ -@@ -1373,19 +1483,32 @@ void ip_conntrack_cleanup(void) - ip_ct_event_cache_flush(); - i_see_dead_people: - ip_conntrack_flush(); -- if (atomic_read(&ip_conntrack_count) != 0) { -+ if (atomic_read(&ve_ip_conntrack_count) != 0) { - schedule(); - goto i_see_dead_people; - } -- /* wait until all references to ip_conntrack_untracked are dropped */ -- while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) -- schedule(); -- -- kmem_cache_destroy(ip_conntrack_cachep); -- kmem_cache_destroy(ip_conntrack_expect_cachep); -- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, -+ if (ve_is_super(get_exec_env())) { -+ /* wait until all references to ip_conntrack_untracked are -+ * dropped */ -+ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) -+ schedule(); -+ ip_ct_attach = NULL; -+ ip_conntrack_cache_free(); -+ } -+ free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc, - ip_conntrack_htable_size); -- nf_unregister_sockopt(&so_getorigdst); -+ ve_ip_conntrack_hash = NULL; -+ INIT_LIST_HEAD(&ve_ip_conntrack_unconfirmed); -+ INIT_LIST_HEAD(&ve_ip_conntrack_expect_list); -+ INIT_LIST_HEAD(&ve_ip_conntrack_helpers); -+ atomic_set(&ve_ip_conntrack_count, 0); -+ ve_ip_conntrack_max = 0; -+#ifdef CONFIG_VE_IPTABLES -+ kfree(ve_ip_ct_protos); -+ ve_ip_ct_protos = NULL; -+ kfree(get_exec_env()->_ip_conntrack); -+ get_exec_env()->_ip_conntrack = NULL; -+#endif - } - - static struct list_head *alloc_hashtable(int size, int *vmalloced) -@@ -1394,13 +1517,13 @@ static struct list_head *alloc_hashtable - unsigned int i; - - *vmalloced = 0; -- hash = (void*)__get_free_pages(GFP_KERNEL, -+ hash = (void*)__get_free_pages(GFP_KERNEL_UBC, - get_order(sizeof(struct list_head) - * size)); - if (!hash) { - *vmalloced = 1; - printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); -- hash = vmalloc(sizeof(struct list_head) * size); -+ hash = ub_vmalloc(sizeof(struct list_head) * size); - } - - if (hash) -@@ -1436,8 +1559,8 @@ static int 
set_hashsize(const char *val, - - write_lock_bh(&ip_conntrack_lock); - for (i = 0; i < ip_conntrack_htable_size; i++) { -- while (!list_empty(&ip_conntrack_hash[i])) { -- h = list_entry(ip_conntrack_hash[i].next, -+ while (!list_empty(&ve_ip_conntrack_hash[i])) { -+ h = list_entry(ve_ip_conntrack_hash[i].next, - struct ip_conntrack_tuple_hash, list); - list_del(&h->list); - bucket = __hash_conntrack(&h->tuple, hashsize, rnd); -@@ -1445,12 +1568,12 @@ static int set_hashsize(const char *val, - } - } - old_size = ip_conntrack_htable_size; -- old_vmalloced = ip_conntrack_vmalloc; -- old_hash = ip_conntrack_hash; -+ old_vmalloced = ve_ip_conntrack_vmalloc; -+ old_hash = ve_ip_conntrack_hash; - - ip_conntrack_htable_size = hashsize; -- ip_conntrack_vmalloc = vmalloced; -- ip_conntrack_hash = hash; -+ ve_ip_conntrack_vmalloc = vmalloced; -+ ve_ip_conntrack_hash = hash; - ip_conntrack_hash_rnd = rnd; - write_unlock_bh(&ip_conntrack_lock); - -@@ -1461,9 +1584,8 @@ static int set_hashsize(const char *val, - module_param_call(hashsize, set_hashsize, param_get_uint, - &ip_conntrack_htable_size, 0600); - --int __init ip_conntrack_init(void) -+static int ip_conntrack_cache_create(void) - { -- unsigned int i; - int ret; - - /* Idea from tcp.c: use 1/16384 of memory. 
On i386: 32MB -@@ -1477,70 +1599,127 @@ int __init ip_conntrack_init(void) - if (ip_conntrack_htable_size < 16) - ip_conntrack_htable_size = 16; - } -- ip_conntrack_max = 8 * ip_conntrack_htable_size; -+ ve_ip_conntrack_max = 8 * ip_conntrack_htable_size; - - printk("ip_conntrack version %s (%u buckets, %d max)" - " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, -- ip_conntrack_htable_size, ip_conntrack_max, -+ ip_conntrack_htable_size, ve_ip_conntrack_max, - sizeof(struct ip_conntrack)); - - ret = nf_register_sockopt(&so_getorigdst); - if (ret != 0) { - printk(KERN_ERR "Unable to register netfilter socket option\n"); -- return ret; -- } -- -- ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, -- &ip_conntrack_vmalloc); -- if (!ip_conntrack_hash) { -- printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); -- goto err_unreg_sockopt; -+ goto out_sockopt; - } - -+ ret = -ENOMEM; - ip_conntrack_cachep = kmem_cache_create("ip_conntrack", - sizeof(struct ip_conntrack), 0, -- 0, NULL, NULL); -+ SLAB_UBC, NULL, NULL); - if (!ip_conntrack_cachep) { - printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); -- goto err_free_hash; -+ goto err_unreg_sockopt; - } - - ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", - sizeof(struct ip_conntrack_expect), -- 0, 0, NULL, NULL); -+ 0, SLAB_UBC, NULL, NULL); - if (!ip_conntrack_expect_cachep) { - printk(KERN_ERR "Unable to create ip_expect slab cache\n"); - goto err_free_conntrack_slab; - } - -+ return 0; -+ -+err_free_conntrack_slab: -+ kmem_cache_destroy(ip_conntrack_cachep); -+err_unreg_sockopt: -+ nf_unregister_sockopt(&so_getorigdst); -+out_sockopt: -+ return ret; -+} -+ -+int ip_conntrack_init(void) -+{ -+ struct ve_struct *env; -+ unsigned int i; -+ int ret; -+ -+ env = get_exec_env(); -+#ifdef CONFIG_VE_IPTABLES -+ ret = -ENOMEM; -+ env->_ip_conntrack = -+ kmalloc(sizeof(struct ve_ip_conntrack), GFP_KERNEL); -+ if (!env->_ip_conntrack) -+ goto out; -+ memset(env->_ip_conntrack, 0, 
sizeof(struct ve_ip_conntrack)); -+ if (ve_is_super(env)) { -+ ret = ip_conntrack_cache_create(); -+ if (ret) -+ goto cache_fail; -+ } else -+ ve_ip_conntrack_max = 8 * ip_conntrack_htable_size; -+#else /* CONFIG_VE_IPTABLES */ -+ ret = ip_conntrack_cache_create(); -+ if (ret) -+ goto out; -+#endif -+ -+ ret = -ENOMEM; -+ ve_ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, -+ &ve_ip_conntrack_vmalloc); -+ if (!ve_ip_conntrack_hash) { -+ printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); -+ goto err_free_cache; -+ } -+ -+#ifdef CONFIG_VE_IPTABLES -+ ve_ip_ct_protos = (struct ip_conntrack_protocol **) -+ ub_kmalloc(sizeof(void *)*MAX_IP_CT_PROTO, GFP_KERNEL); -+ if (!ve_ip_ct_protos) -+ goto err_free_hash; -+#endif - /* Don't NEED lock here, but good form anyway. */ - write_lock_bh(&ip_conntrack_lock); - for (i = 0; i < MAX_IP_CT_PROTO; i++) -- ip_ct_protos[i] = &ip_conntrack_generic_protocol; -+ ve_ip_ct_protos[i] = &ip_conntrack_generic_protocol; - /* Sew in builtin protocols. 
*/ -- ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; -- ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; -- ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; -+ ve_ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; -+ ve_ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; -+ ve_ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; - write_unlock_bh(&ip_conntrack_lock); - -- /* For use by ipt_REJECT */ -- ip_ct_attach = ip_conntrack_attach; -- -- /* Set up fake conntrack: -- - to never be deleted, not in any hashes */ -- atomic_set(&ip_conntrack_untracked.ct_general.use, 1); -- /* - and look it like as a confirmed connection */ -- set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); -+ INIT_LIST_HEAD(&ve_ip_conntrack_unconfirmed); -+ INIT_LIST_HEAD(&ve_ip_conntrack_expect_list); -+ INIT_LIST_HEAD(&ve_ip_conntrack_helpers); -+ -+ if (ve_is_super(env)) { -+ /* For use by ipt_REJECT */ -+ ip_ct_attach = ip_conntrack_attach; -+ -+ /* Set up fake conntrack: -+ - to never be deleted, not in any hashes */ -+ atomic_set(&ip_conntrack_untracked.ct_general.use, 1); -+ /* - and look it like as a confirmed connection */ -+ set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); -+ } - -- return ret; -+ return 0; - --err_free_conntrack_slab: -- kmem_cache_destroy(ip_conntrack_cachep); -+#ifdef CONFIG_VE_IPTABLES - err_free_hash: -- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, -+#endif -+ free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc, - ip_conntrack_htable_size); --err_unreg_sockopt: -- nf_unregister_sockopt(&so_getorigdst); -- -- return -ENOMEM; -+ ve_ip_conntrack_hash = NULL; -+err_free_cache: -+ if (ve_is_super(env)) -+ ip_conntrack_cache_free(); -+#ifdef CONFIG_VE_IPTABLES -+cache_fail: -+ kfree(env->_ip_conntrack); -+ env->_ip_conntrack = NULL; -+#endif -+out: -+ return ret; - } -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_ftp.c 
linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_ftp.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -15,6 +15,7 @@ - #include <linux/ctype.h> - #include <net/checksum.h> - #include <net/tcp.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter_ipv4/ip_conntrack_helper.h> - #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> -@@ -425,8 +426,8 @@ static int help(struct sk_buff **pskb, - - /* Now, NAT might want to mangle the packet, and register the - * (possibly changed) expectation itself. */ -- if (ip_nat_ftp_hook) -- ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, -+ if (ve_ip_nat_ftp_hook) -+ ret = ve_ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, - matchoff, matchlen, exp, &seq); - else { - /* Can't expect this? Best to drop packet now. */ -@@ -452,16 +453,39 @@ out_update_nl: - static struct ip_conntrack_helper ftp[MAX_PORTS]; - static char ftp_names[MAX_PORTS][sizeof("ftp-65535")]; - --/* Not __exit: called from init() */ --static void fini(void) -+void fini_iptable_ftp(void) - { - int i; - for (i = 0; i < ports_c; i++) { - DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", - ports[i]); -- ip_conntrack_helper_unregister(&ftp[i]); -+ virt_ip_conntrack_helper_unregister(&ftp[i]); - } -+} -+ -+int init_iptable_ftp(void) -+{ -+ int i, ret; - -+ for (i = 0; i < ports_c; i++) { -+ DEBUGP("ip_ct_ftp: registering helper for port %d\n", -+ ports[i]); -+ ret = virt_ip_conntrack_helper_register(&ftp[i]); -+ if (ret) { -+ fini_iptable_ftp(); -+ return ret; -+ } -+ } -+ return 0; -+} -+ -+/* Not __exit: called from init() */ -+static void fini(void) -+{ -+ KSYMMODUNRESOLVE(ip_conntrack_ftp); -+ KSYMUNRESOLVE(init_iptable_ftp); -+ KSYMUNRESOLVE(fini_iptable_ftp); -+ fini_iptable_ftp(); - kfree(ftp_buffer); - } - -@@ -496,13 +520,17 @@ static int __init init(void) - - DEBUGP("ip_ct_ftp: 
registering helper for port %d\n", - ports[i]); -- ret = ip_conntrack_helper_register(&ftp[i]); -+ ret = virt_ip_conntrack_helper_register(&ftp[i]); - - if (ret) { - fini(); - return ret; - } - } -+ -+ KSYMRESOLVE(init_iptable_ftp); -+ KSYMRESOLVE(fini_iptable_ftp); -+ KSYMMODRESOLVE(ip_conntrack_ftp); - return 0; - } - -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_irc.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_irc.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_irc.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_irc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -28,6 +28,7 @@ - #include <linux/ip.h> - #include <net/checksum.h> - #include <net/tcp.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter_ipv4/ip_conntrack_helper.h> - #include <linux/netfilter_ipv4/ip_conntrack_irc.h> -@@ -244,6 +245,33 @@ static char irc_names[MAX_PORTS][sizeof( - - static void fini(void); - -+void fini_iptable_irc(void) -+{ -+ int i; -+ for (i = 0; i < ports_c; i++) { -+ DEBUGP("unregistering port %d\n", -+ ports[i]); -+ virt_ip_conntrack_helper_unregister(&irc_helpers[i]); -+ } -+} -+ -+int init_iptable_irc(void) -+{ -+ int i, ret; -+ -+ for (i = 0; i < ports_c; i++) { -+ DEBUGP("port #%d: %d\n", i, ports[i]); -+ ret = virt_ip_conntrack_helper_register(&irc_helpers[i]); -+ if (ret) { -+ printk("ip_conntrack_irc: ERROR registering port %d\n", -+ ports[i]); -+ fini_iptable_irc(); -+ return -EBUSY; -+ } -+ } -+ return 0; -+} -+ - static int __init init(void) - { - int i, ret; -@@ -283,7 +311,7 @@ static int __init init(void) - - DEBUGP("port #%d: %d\n", i, ports[i]); - -- ret = ip_conntrack_helper_register(hlpr); -+ ret = virt_ip_conntrack_helper_register(hlpr); - - if (ret) { - printk("ip_conntrack_irc: ERROR registering port %d\n", -@@ -292,6 +320,10 @@ static int __init init(void) - return -EBUSY; - } - } -+ -+ KSYMRESOLVE(init_iptable_irc); -+ KSYMRESOLVE(fini_iptable_irc); -+ 
KSYMMODRESOLVE(ip_conntrack_irc); - return 0; - } - -@@ -299,12 +331,10 @@ static int __init init(void) - * it is needed by the init function */ - static void fini(void) - { -- int i; -- for (i = 0; i < ports_c; i++) { -- DEBUGP("unregistering port %d\n", -- ports[i]); -- ip_conntrack_helper_unregister(&irc_helpers[i]); -- } -+ KSYMMODUNRESOLVE(ip_conntrack_irc); -+ KSYMUNRESOLVE(init_iptable_irc); -+ KSYMUNRESOLVE(fini_iptable_irc); -+ fini_iptable_irc(); - kfree(irc_buffer); - } - -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_netlink.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_netlink.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-04-19 15:02:12.000000000 +0400 -@@ -29,6 +29,7 @@ - #include <linux/spinlock.h> - #include <linux/interrupt.h> - #include <linux/notifier.h> -+#include <net/sock.h> - - #include <linux/netfilter.h> - #include <linux/netfilter_ipv4/ip_conntrack.h> -@@ -39,6 +40,8 @@ - - #include <linux/netfilter/nfnetlink.h> - #include <linux/netfilter/nfnetlink_conntrack.h> -+#include <ub/beancounter.h> -+#include <ub/ub_sk.h> - - MODULE_LICENSE("GPL"); - -@@ -403,7 +406,7 @@ ctnetlink_dump_table(struct sk_buff *skb - - read_lock_bh(&ip_conntrack_lock); - for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { -- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { -+ list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) { - h = (struct ip_conntrack_tuple_hash *) i; - if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) - continue; -@@ -440,7 +443,7 @@ ctnetlink_dump_table_w(struct sk_buff *s - - write_lock_bh(&ip_conntrack_lock); - for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { -- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { -+ list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) { - h = (struct ip_conntrack_tuple_hash *) i; - if 
(DIRECTION(h) != IP_CT_DIR_ORIGINAL) - continue; -@@ -1003,14 +1006,15 @@ ctnetlink_change_conntrack(struct ip_con - static int - ctnetlink_create_conntrack(struct nfattr *cda[], - struct ip_conntrack_tuple *otuple, -- struct ip_conntrack_tuple *rtuple) -+ struct ip_conntrack_tuple *rtuple, -+ struct user_beancounter *ub) - { - struct ip_conntrack *ct; - int err = -EINVAL; - - DEBUGP("entered %s\n", __FUNCTION__); - -- ct = ip_conntrack_alloc(otuple, rtuple); -+ ct = ip_conntrack_alloc(otuple, rtuple, ub); - if (ct == NULL || IS_ERR(ct)) - return -ENOMEM; - -@@ -1087,8 +1091,16 @@ ctnetlink_new_conntrack(struct sock *ctn - write_unlock_bh(&ip_conntrack_lock); - DEBUGP("no such conntrack, create new\n"); - err = -ENOENT; -- if (nlh->nlmsg_flags & NLM_F_CREATE) -- err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); -+ if (nlh->nlmsg_flags & NLM_F_CREATE) { -+#ifdef CONFIG_USER_RESOURCE -+ if (skb->sk) -+ err = ctnetlink_create_conntrack(cda, &otuple, -+ &rtuple, sock_bc(skb->sk)->ub); -+ else -+#endif -+ err = ctnetlink_create_conntrack(cda, -+ &otuple, &rtuple, NULL); -+ } - return err; - } - /* implicit 'else' */ -@@ -1249,7 +1261,7 @@ ctnetlink_exp_dump_table(struct sk_buff - DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); - - read_lock_bh(&ip_conntrack_lock); -- list_for_each_prev(i, &ip_conntrack_expect_list) { -+ list_for_each_prev(i, &ve_ip_conntrack_expect_list) { - exp = (struct ip_conntrack_expect *) i; - if (exp->id <= *id) - continue; -@@ -1395,7 +1407,7 @@ ctnetlink_del_expect(struct sock *ctnl, - write_unlock_bh(&ip_conntrack_lock); - return -EINVAL; - } -- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, -+ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, - list) { - if (exp->master->helper == h - && del_timer(&exp->timeout)) { -@@ -1407,7 +1419,7 @@ ctnetlink_del_expect(struct sock *ctnl, - } else { - /* This basically means we have to flush everything*/ - write_lock_bh(&ip_conntrack_lock); -- 
list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, -+ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, - list) { - if (del_timer(&exp->timeout)) { - ip_ct_unlink_expect(exp); -@@ -1619,7 +1631,7 @@ static void __exit ctnetlink_exit(void) - printk("ctnetlink: unregistering from nfnetlink.\n"); - - #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -- ip_conntrack_unregister_notifier(&ctnl_notifier_exp); -+ ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp); - ip_conntrack_unregister_notifier(&ctnl_notifier); - #endif - -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_proto_generic.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-04-19 15:02:12.000000000 +0400 -@@ -52,7 +52,7 @@ static int packet(struct ip_conntrack *c - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) - { -- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); -+ ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_generic_timeout); - return NF_ACCEPT; - } - -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_proto_icmp.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -104,7 +104,7 @@ static int icmp_packet(struct ip_conntra - } else { - atomic_inc(&ct->proto.icmp.count); - ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); -- ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); -+ ip_ct_refresh_acct(ct, ctinfo, skb, ve_ip_ct_icmp_timeout); - } - - return NF_ACCEPT; -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 
linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_proto_tcp.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -98,7 +98,7 @@ unsigned int ip_ct_tcp_timeout_close = - to ~13-30min depending on RTO. */ - unsigned int ip_ct_tcp_timeout_max_retrans = 5 MINS; - --static const unsigned int * tcp_timeouts[] -+const unsigned int * tcp_timeouts[] - = { NULL, /* TCP_CONNTRACK_NONE */ - &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ - &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ -@@ -762,7 +762,7 @@ static int tcp_in_window(struct ip_ct_tc - : "SEQ is under the lower bound (already ACKed data retransmitted)" - : "SEQ is over the upper bound (over the window of the receiver)"); - -- res = ip_ct_tcp_be_liberal; -+ res = ve_ip_ct_tcp_be_liberal; - } - - DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " -@@ -1033,9 +1033,11 @@ static int tcp_packet(struct ip_conntrac - && (new_state == TCP_CONNTRACK_FIN_WAIT - || new_state == TCP_CONNTRACK_CLOSE)) - conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; -- timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans -- && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans -- ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; -+ timeout = conntrack->proto.tcp.retrans >= ve_ip_ct_tcp_max_retrans && -+ ve_ip_ct_tcp_timeouts[new_state] > -+ ve_ip_ct_tcp_timeout_max_retrans -+ ? 
ve_ip_ct_tcp_timeout_max_retrans : -+ ve_ip_ct_tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); - - ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); -@@ -1110,7 +1112,7 @@ static int tcp_new(struct ip_conntrack * - conntrack->proto.tcp.seen[1].flags = 0; - conntrack->proto.tcp.seen[0].loose = - conntrack->proto.tcp.seen[1].loose = 0; -- } else if (ip_ct_tcp_loose == 0) { -+ } else if (ve_ip_ct_tcp_loose == 0) { - /* Don't try to pick up connections. */ - return 0; - } else { -@@ -1134,7 +1136,7 @@ static int tcp_new(struct ip_conntrack * - conntrack->proto.tcp.seen[0].flags = - conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; - conntrack->proto.tcp.seen[0].loose = -- conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; -+ conntrack->proto.tcp.seen[1].loose = ve_ip_ct_tcp_loose; - } - - conntrack->proto.tcp.seen[1].td_end = 0; -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_proto_udp.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -71,12 +71,12 @@ static int udp_packet(struct ip_conntrac - stream. Extend timeout. 
*/ - if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - ip_ct_refresh_acct(conntrack, ctinfo, skb, -- ip_ct_udp_timeout_stream); -+ ve_ip_ct_udp_timeout_stream); - /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) - ip_conntrack_event_cache(IPCT_STATUS, skb); - } else -- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); -+ ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_udp_timeout); - - return NF_ACCEPT; - } -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_standalone.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_standalone.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-04-19 15:02:12.000000000 +0400 -@@ -28,6 +28,7 @@ - #include <net/checksum.h> - #include <net/ip.h> - #include <net/route.h> -+#include <linux/nfcalls.h> - - #define ASSERT_READ_LOCK(x) - #define ASSERT_WRITE_LOCK(x) -@@ -46,9 +47,31 @@ - - MODULE_LICENSE("GPL"); - -+int ip_conntrack_disable_ve0 = 0; -+module_param(ip_conntrack_disable_ve0, int, 0440); -+ - extern atomic_t ip_conntrack_count; -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_ip_conntrack_count \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_count) -+#else -+#define ve_ip_conntrack_count ip_conntrack_count -+#endif - DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); - -+/* Prior to 2.6.15, we had a ip_conntrack_enable_ve0 param. */ -+static int warn_set(const char *val, struct kernel_param *kp) -+{ -+ printk(KERN_INFO KBUILD_MODNAME -+ ": parameter ip_conntrack_enable_ve0 is obsoleted. 
In ovzkernel" -+ " >= 2.6.15 connection tracking on hardware node is enabled by " -+ "default, use ip_conntrack_disable_ve0=1 parameter to " -+ "disable.\n"); -+ return 0; -+} -+module_param_call(ip_conntrack_enable_ve0, warn_set, NULL, NULL, 0); -+ - static int kill_proto(struct ip_conntrack *i, void *data) - { - return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == -@@ -89,8 +112,8 @@ static struct list_head *ct_get_first(st - for (st->bucket = 0; - st->bucket < ip_conntrack_htable_size; - st->bucket++) { -- if (!list_empty(&ip_conntrack_hash[st->bucket])) -- return ip_conntrack_hash[st->bucket].next; -+ if (!list_empty(&ve_ip_conntrack_hash[st->bucket])) -+ return ve_ip_conntrack_hash[st->bucket].next; - } - return NULL; - } -@@ -100,10 +123,10 @@ static struct list_head *ct_get_next(str - struct ct_iter_state *st = seq->private; - - head = head->next; -- while (head == &ip_conntrack_hash[st->bucket]) { -+ while (head == &ve_ip_conntrack_hash[st->bucket]) { - if (++st->bucket >= ip_conntrack_htable_size) - return NULL; -- head = ip_conntrack_hash[st->bucket].next; -+ head = ve_ip_conntrack_hash[st->bucket].next; - } - return head; - } -@@ -234,7 +257,7 @@ static struct file_operations ct_file_op - /* expects */ - static void *exp_seq_start(struct seq_file *s, loff_t *pos) - { -- struct list_head *e = &ip_conntrack_expect_list; -+ struct list_head *e = &ve_ip_conntrack_expect_list; - loff_t i; - - /* strange seq_file api calls stop even if we fail, -@@ -246,7 +269,7 @@ static void *exp_seq_start(struct seq_fi - - for (i = 0; i <= *pos; i++) { - e = e->next; -- if (e == &ip_conntrack_expect_list) -+ if (e == &ve_ip_conntrack_expect_list) - return NULL; - } - return e; -@@ -259,7 +282,7 @@ static void *exp_seq_next(struct seq_fil - ++*pos; - e = e->next; - -- if (e == &ip_conntrack_expect_list) -+ if (e == &ve_ip_conntrack_expect_list) - return NULL; - - return e; -@@ -344,7 +367,7 @@ static void ct_cpu_seq_stop(struct seq_f - - static int 
ct_cpu_seq_show(struct seq_file *seq, void *v) - { -- unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); -+ unsigned int nr_conntracks = atomic_read(&ve_ip_conntrack_count); - struct ip_conntrack_stat *st = v; - - if (v == SEQ_START_TOKEN) { -@@ -541,6 +564,28 @@ static struct nf_hook_ops ip_conntrack_l - - /* From ip_conntrack_core.c */ - extern int ip_conntrack_max; -+#ifdef CONFIG_VE_IPTABLES -+#define ve_ip_conntrack_max \ -+ (get_exec_env()->_ip_conntrack->_ip_conntrack_max) -+#define ve_ip_ct_sysctl_header \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_header) -+#define ve_ip_ct_net_table \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_net_table) -+#define ve_ip_ct_ipv4_table \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_ipv4_table) -+#define ve_ip_ct_netfilter_table \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_netfilter_table) -+#define ve_ip_ct_sysctl_table \ -+ (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_table) -+#else -+#define ve_ip_conntrack_max ip_conntrack_max -+static struct ctl_table_header *ip_ct_sysctl_header; -+#define ve_ip_ct_sysctl_header ip_ct_sysctl_header -+#define ve_ip_ct_net_table ip_ct_net_table -+#define ve_ip_ct_ipv4_table ip_ct_ipv4_table -+#define ve_ip_ct_netfilter_table ip_ct_netfilter_table -+#define ve_ip_ct_sysctl_table ip_ct_sysctl_table -+#endif - extern unsigned int ip_conntrack_htable_size; - - /* From ip_conntrack_proto_tcp.c */ -@@ -571,8 +616,6 @@ extern unsigned int ip_ct_generic_timeou - static int log_invalid_proto_min = 0; - static int log_invalid_proto_max = 255; - --static struct ctl_table_header *ip_ct_sysctl_header; -- - static ctl_table ip_ct_sysctl_table[] = { - { - .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, -@@ -781,6 +824,112 @@ static ctl_table ip_ct_net_table[] = { - }; - - EXPORT_SYMBOL(ip_ct_log_invalid); -+ -+#ifdef CONFIG_VE -+static void ip_conntrack_sysctl_cleanup(void) -+{ -+ if (!ve_is_super(get_exec_env())) { -+ kfree(ve_ip_ct_net_table); -+ kfree(ve_ip_ct_ipv4_table); -+ 
kfree(ve_ip_ct_netfilter_table); -+ kfree(ve_ip_ct_sysctl_table); -+ } -+ ve_ip_ct_net_table = NULL; -+ ve_ip_ct_ipv4_table = NULL; -+ ve_ip_ct_netfilter_table = NULL; -+ ve_ip_ct_sysctl_table = NULL; -+} -+ -+#define ALLOC_ENVCTL(field,k,label) \ -+ if ( !(field = kmalloc(k*sizeof(ctl_table), GFP_KERNEL)) ) \ -+ goto label; -+static int ip_conntrack_sysctl_init(void) -+{ -+ int i, ret = 0; -+ -+ ret = -ENOMEM; -+ if (ve_is_super(get_exec_env())) { -+ ve_ip_ct_net_table = ip_ct_net_table; -+ ve_ip_ct_ipv4_table = ip_ct_ipv4_table; -+ ve_ip_ct_netfilter_table = ip_ct_netfilter_table; -+ ve_ip_ct_sysctl_table = ip_ct_sysctl_table; -+ } else { -+ /* allocate structures in ve_struct */ -+ ALLOC_ENVCTL(ve_ip_ct_net_table, 2, out); -+ ALLOC_ENVCTL(ve_ip_ct_ipv4_table, 2, nomem_1); -+ ALLOC_ENVCTL(ve_ip_ct_netfilter_table, 3, nomem_2); -+ ALLOC_ENVCTL(ve_ip_ct_sysctl_table, 15, nomem_3); -+ -+ memcpy(ve_ip_ct_net_table, ip_ct_net_table, -+ 2*sizeof(ctl_table)); -+ memcpy(ve_ip_ct_ipv4_table, ip_ct_ipv4_table, -+ 2*sizeof(ctl_table)); -+ memcpy(ve_ip_ct_netfilter_table, ip_ct_netfilter_table, -+ 3*sizeof(ctl_table)); -+ memcpy(ve_ip_ct_sysctl_table, ip_ct_sysctl_table, -+ 21*sizeof(ctl_table)); -+ -+ ve_ip_ct_net_table[0].child = ve_ip_ct_ipv4_table; -+ ve_ip_ct_ipv4_table[0].child = ve_ip_ct_netfilter_table; -+ ve_ip_ct_netfilter_table[0].child = ve_ip_ct_sysctl_table; -+ } -+ ve_ip_ct_sysctl_table[0].data = &ve_ip_conntrack_max; -+ ve_ip_ct_netfilter_table[1].data = &ve_ip_conntrack_max; -+ ve_ip_ct_sysctl_table[1].data = &ve_ip_conntrack_count; -+ /* skip ve_ip_ct_sysctl_table[2].data as it is read-only and common -+ * for all environments */ -+ ve_ip_ct_tcp_timeouts[1] = ip_ct_tcp_timeout_syn_sent; -+ ve_ip_ct_sysctl_table[3].data = &ve_ip_ct_tcp_timeouts[1]; -+ ve_ip_ct_tcp_timeouts[2] = ip_ct_tcp_timeout_syn_recv; -+ ve_ip_ct_sysctl_table[4].data = &ve_ip_ct_tcp_timeouts[2]; -+ ve_ip_ct_tcp_timeouts[3] = ip_ct_tcp_timeout_established; -+ ve_ip_ct_sysctl_table[5].data 
= &ve_ip_ct_tcp_timeouts[3]; -+ ve_ip_ct_tcp_timeouts[4] = ip_ct_tcp_timeout_fin_wait; -+ ve_ip_ct_sysctl_table[6].data = &ve_ip_ct_tcp_timeouts[4]; -+ ve_ip_ct_tcp_timeouts[5] = ip_ct_tcp_timeout_close_wait; -+ ve_ip_ct_sysctl_table[7].data = &ve_ip_ct_tcp_timeouts[5]; -+ ve_ip_ct_tcp_timeouts[6] = ip_ct_tcp_timeout_last_ack; -+ ve_ip_ct_sysctl_table[8].data = &ve_ip_ct_tcp_timeouts[6]; -+ ve_ip_ct_tcp_timeouts[7] = ip_ct_tcp_timeout_time_wait; -+ ve_ip_ct_sysctl_table[9].data = &ve_ip_ct_tcp_timeouts[7]; -+ ve_ip_ct_tcp_timeouts[8] = ip_ct_tcp_timeout_close; -+ ve_ip_ct_sysctl_table[10].data = &ve_ip_ct_tcp_timeouts[8]; -+ ve_ip_ct_udp_timeout = ip_ct_udp_timeout; -+ ve_ip_ct_sysctl_table[11].data = &ve_ip_ct_udp_timeout; -+ ve_ip_ct_udp_timeout_stream = ip_ct_udp_timeout_stream; -+ ve_ip_ct_sysctl_table[12].data = &ve_ip_ct_udp_timeout_stream; -+ ve_ip_ct_icmp_timeout = ip_ct_icmp_timeout; -+ ve_ip_ct_sysctl_table[13].data = &ve_ip_ct_icmp_timeout; -+ ve_ip_ct_generic_timeout = ip_ct_generic_timeout; -+ ve_ip_ct_sysctl_table[14].data = &ve_ip_ct_generic_timeout; -+ ve_ip_ct_log_invalid = ip_ct_log_invalid; -+ ve_ip_ct_sysctl_table[15].data = &ve_ip_ct_log_invalid; -+ ve_ip_ct_tcp_timeout_max_retrans = ip_ct_tcp_timeout_max_retrans; -+ ve_ip_ct_sysctl_table[16].data = &ve_ip_ct_tcp_timeout_max_retrans; -+ ve_ip_ct_tcp_loose = ip_ct_tcp_loose; -+ ve_ip_ct_sysctl_table[17].data = &ve_ip_ct_tcp_loose; -+ ve_ip_ct_tcp_be_liberal = ip_ct_tcp_be_liberal; -+ ve_ip_ct_sysctl_table[18].data = &ve_ip_ct_tcp_be_liberal; -+ ve_ip_ct_tcp_max_retrans = ip_ct_tcp_max_retrans; -+ ve_ip_ct_sysctl_table[19].data = &ve_ip_ct_tcp_max_retrans; -+ for (i = 0; i < 20; i++) -+ ve_ip_ct_sysctl_table[i].owner_env = get_exec_env(); -+ ve_ip_ct_netfilter_table[1].owner_env = get_exec_env(); -+ return 0; -+ -+nomem_3: -+ kfree(ve_ip_ct_netfilter_table); -+ ve_ip_ct_netfilter_table = NULL; -+nomem_2: -+ kfree(ve_ip_ct_ipv4_table); -+ ve_ip_ct_ipv4_table = NULL; -+nomem_1: -+ 
kfree(ve_ip_ct_net_table); -+ ve_ip_ct_net_table = NULL; -+out: -+ return ret; -+} -+#endif /*CONFIG_VE*/ - #endif /* CONFIG_SYSCTL */ - - static int init_or_cleanup(int init) -@@ -792,9 +941,16 @@ static int init_or_cleanup(int init) - - if (!init) goto cleanup; - -+ ret = -ENOENT; -+ if (!ve_is_super(get_exec_env())) -+ __module_get(THIS_MODULE); -+ - ret = ip_conntrack_init(); - if (ret < 0) -- goto cleanup_nothing; -+ goto cleanup_unget; -+ -+ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) -+ return 0; - - #ifdef CONFIG_PROC_FS - ret = -ENOMEM; -@@ -804,98 +960,115 @@ static int init_or_cleanup(int init) - proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, - &exp_file_ops); - if (!proc_exp) goto cleanup_proc; -+ proc_exp->proc_fops = &exp_file_ops; - -- proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); -- if (!proc_stat) -- goto cleanup_proc_exp; -+ if (ve_is_super(get_exec_env())) { -+ proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); -+ if (!proc_stat) -+ goto cleanup_proc_exp; - -- proc_stat->proc_fops = &ct_cpu_seq_fops; -- proc_stat->owner = THIS_MODULE; -+ proc_stat->proc_fops = &ct_cpu_seq_fops; -+ proc_stat->owner = THIS_MODULE; -+ } - #endif - -- ret = nf_register_hook(&ip_conntrack_defrag_ops); -+ ret = virt_nf_register_hook(&ip_conntrack_defrag_ops); - if (ret < 0) { - printk("ip_conntrack: can't register pre-routing defrag hook.\n"); - goto cleanup_proc_stat; - } -- ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops); -+ ret = virt_nf_register_hook(&ip_conntrack_defrag_local_out_ops); - if (ret < 0) { - printk("ip_conntrack: can't register local_out defrag hook.\n"); - goto cleanup_defragops; - } -- ret = nf_register_hook(&ip_conntrack_in_ops); -+ ret = virt_nf_register_hook(&ip_conntrack_in_ops); - if (ret < 0) { - printk("ip_conntrack: can't register pre-routing hook.\n"); - goto cleanup_defraglocalops; - } -- ret = nf_register_hook(&ip_conntrack_local_out_ops); -+ ret = 
virt_nf_register_hook(&ip_conntrack_local_out_ops); - if (ret < 0) { - printk("ip_conntrack: can't register local out hook.\n"); - goto cleanup_inops; - } -- ret = nf_register_hook(&ip_conntrack_helper_in_ops); -+ ret = virt_nf_register_hook(&ip_conntrack_helper_in_ops); - if (ret < 0) { - printk("ip_conntrack: can't register local in helper hook.\n"); - goto cleanup_inandlocalops; - } -- ret = nf_register_hook(&ip_conntrack_helper_out_ops); -+ ret = virt_nf_register_hook(&ip_conntrack_helper_out_ops); - if (ret < 0) { - printk("ip_conntrack: can't register postrouting helper hook.\n"); - goto cleanup_helperinops; - } -- ret = nf_register_hook(&ip_conntrack_out_ops); -+ ret = virt_nf_register_hook(&ip_conntrack_out_ops); - if (ret < 0) { - printk("ip_conntrack: can't register post-routing hook.\n"); - goto cleanup_helperoutops; - } -- ret = nf_register_hook(&ip_conntrack_local_in_ops); -+ ret = virt_nf_register_hook(&ip_conntrack_local_in_ops); - if (ret < 0) { - printk("ip_conntrack: can't register local in hook.\n"); - goto cleanup_inoutandlocalops; - } - #ifdef CONFIG_SYSCTL -- ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); -- if (ip_ct_sysctl_header == NULL) { -+#ifdef CONFIG_VE -+ ret = ip_conntrack_sysctl_init(); -+ if (ret < 0) -+ goto cleanup_sysctl; -+#endif -+ ret = -ENOMEM; -+ ve_ip_ct_sysctl_header = register_sysctl_table(ve_ip_ct_net_table, 0); -+ if (ve_ip_ct_sysctl_header == NULL) { - printk("ip_conntrack: can't register to sysctl.\n"); -- ret = -ENOMEM; -- goto cleanup_localinops; -+ goto cleanup_sysctl2; - } - #endif - -- return ret; -+ return 0; - - cleanup: -+ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) -+ goto cleanup_init; - synchronize_net(); - #ifdef CONFIG_SYSCTL -- unregister_sysctl_table(ip_ct_sysctl_header); -- cleanup_localinops: -+ unregister_sysctl_table(ve_ip_ct_sysctl_header); -+ cleanup_sysctl2: -+#ifdef CONFIG_VE -+ ip_conntrack_sysctl_cleanup(); -+ cleanup_sysctl: -+#endif - #endif -- 
nf_unregister_hook(&ip_conntrack_local_in_ops); -+ virt_nf_unregister_hook(&ip_conntrack_local_in_ops); - cleanup_inoutandlocalops: -- nf_unregister_hook(&ip_conntrack_out_ops); -+ virt_nf_unregister_hook(&ip_conntrack_out_ops); - cleanup_helperoutops: -- nf_unregister_hook(&ip_conntrack_helper_out_ops); -+ virt_nf_unregister_hook(&ip_conntrack_helper_out_ops); - cleanup_helperinops: -- nf_unregister_hook(&ip_conntrack_helper_in_ops); -+ virt_nf_unregister_hook(&ip_conntrack_helper_in_ops); - cleanup_inandlocalops: -- nf_unregister_hook(&ip_conntrack_local_out_ops); -+ virt_nf_unregister_hook(&ip_conntrack_local_out_ops); - cleanup_inops: -- nf_unregister_hook(&ip_conntrack_in_ops); -+ virt_nf_unregister_hook(&ip_conntrack_in_ops); - cleanup_defraglocalops: -- nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); -+ virt_nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); - cleanup_defragops: -- nf_unregister_hook(&ip_conntrack_defrag_ops); -+ virt_nf_unregister_hook(&ip_conntrack_defrag_ops); - cleanup_proc_stat: - #ifdef CONFIG_PROC_FS -- remove_proc_entry("ip_conntrack", proc_net_stat); -+ if (ve_is_super(get_exec_env())) -+ remove_proc_entry("ip_conntrack", proc_net_stat); - cleanup_proc_exp: - proc_net_remove("ip_conntrack_expect"); - cleanup_proc: - proc_net_remove("ip_conntrack"); -- cleanup_init: - #endif /* CONFIG_PROC_FS */ -+ cleanup_init: - ip_conntrack_cleanup(); -- cleanup_nothing: -+ cleanup_unget: -+ if (!ve_is_super(get_exec_env())) -+ module_put(THIS_MODULE); - return ret; - } - -@@ -906,11 +1079,11 @@ int ip_conntrack_protocol_register(struc - int ret = 0; - - write_lock_bh(&ip_conntrack_lock); -- if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { -+ if (ve_ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { - ret = -EBUSY; - goto out; - } -- ip_ct_protos[proto->proto] = proto; -+ ve_ip_ct_protos[proto->proto] = proto; - out: - write_unlock_bh(&ip_conntrack_lock); - return ret; -@@ -919,7 +1092,7 @@ int 
ip_conntrack_protocol_register(struc - void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) - { - write_lock_bh(&ip_conntrack_lock); -- ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; -+ ve_ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; - write_unlock_bh(&ip_conntrack_lock); - - /* Somebody could be still looking at the proto in bh. */ -@@ -929,17 +1102,39 @@ void ip_conntrack_protocol_unregister(st - ip_ct_iterate_cleanup(kill_proto, &proto->proto); - } - --static int __init init(void) -+int init_iptable_conntrack(void) - { - return init_or_cleanup(1); - } - --static void __exit fini(void) -+void fini_iptable_conntrack(void) - { - init_or_cleanup(0); - } - --module_init(init); -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_conntrack(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_conntrack); -+ KSYMRESOLVE(fini_iptable_conntrack); -+ KSYMMODRESOLVE(ip_conntrack); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ip_conntrack); -+ KSYMUNRESOLVE(init_iptable_conntrack); -+ KSYMUNRESOLVE(fini_iptable_conntrack); -+ fini_iptable_conntrack(); -+} -+ -+subsys_initcall(init); - module_exit(fini); - - /* Some modules need us, but don't depend directly on any symbol. 
-@@ -956,15 +1151,20 @@ EXPORT_SYMBOL_GPL(ip_conntrack_unregiste - EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); - EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); - #endif -+EXPORT_SYMBOL(ip_conntrack_disable_ve0); - EXPORT_SYMBOL(ip_conntrack_protocol_register); - EXPORT_SYMBOL(ip_conntrack_protocol_unregister); - EXPORT_SYMBOL(ip_ct_get_tuple); - EXPORT_SYMBOL(invert_tuplepr); - EXPORT_SYMBOL(ip_conntrack_alter_reply); -+#ifndef CONFIG_VE_IPTABLES - EXPORT_SYMBOL(ip_conntrack_destroyed); -+#endif - EXPORT_SYMBOL(need_conntrack); - EXPORT_SYMBOL(ip_conntrack_helper_register); - EXPORT_SYMBOL(ip_conntrack_helper_unregister); -+EXPORT_SYMBOL(virt_ip_conntrack_helper_register); -+EXPORT_SYMBOL(virt_ip_conntrack_helper_unregister); - EXPORT_SYMBOL(ip_ct_iterate_cleanup); - EXPORT_SYMBOL(__ip_ct_refresh_acct); - -@@ -974,14 +1174,18 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_expect_ - EXPORT_SYMBOL_GPL(ip_conntrack_expect_find); - EXPORT_SYMBOL(ip_conntrack_expect_related); - EXPORT_SYMBOL(ip_conntrack_unexpect_related); -+#ifndef CONFIG_VE_IPTABLES - EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); -+#endif - EXPORT_SYMBOL_GPL(ip_ct_unlink_expect); - - EXPORT_SYMBOL(ip_conntrack_tuple_taken); - EXPORT_SYMBOL(ip_ct_gather_frags); - EXPORT_SYMBOL(ip_conntrack_htable_size); - EXPORT_SYMBOL(ip_conntrack_lock); -+#ifndef CONFIG_VE_IPTABLES - EXPORT_SYMBOL(ip_conntrack_hash); -+#endif - EXPORT_SYMBOL(ip_conntrack_untracked); - EXPORT_SYMBOL_GPL(ip_conntrack_find_get); - #ifdef CONFIG_IP_NF_NAT_NEEDED -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_core.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_core.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_core.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_core.c 2006-04-19 15:02:12.000000000 +0400 -@@ -21,6 +21,8 @@ - #include <linux/icmp.h> - #include <linux/udp.h> - #include <linux/jhash.h> -+#include <linux/nfcalls.h> -+#include <ub/ub_mem.h> - - #define ASSERT_READ_LOCK(x) - 
#define ASSERT_WRITE_LOCK(x) -@@ -46,15 +48,24 @@ DEFINE_RWLOCK(ip_nat_lock); - /* Calculated at init based on memory size */ - static unsigned int ip_nat_htable_size; - --static struct list_head *bysource; -- - #define MAX_IP_NAT_PROTO 256 -+ -+#ifdef CONFIG_VE_IPTABLES -+#define ve_ip_nat_bysource \ -+ (get_exec_env()->_ip_conntrack->_ip_nat_bysource) -+#define ve_ip_nat_protos \ -+ (get_exec_env()->_ip_conntrack->_ip_nat_protos) -+#else -+static struct list_head *bysource; -+#define ve_ip_nat_bysource bysource - static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; -+#define ve_ip_nat_protos ip_nat_protos -+#endif - - static inline struct ip_nat_protocol * - __ip_nat_proto_find(u_int8_t protonum) - { -- return ip_nat_protos[protonum]; -+ return ve_ip_nat_protos[protonum]; - } - - struct ip_nat_protocol * -@@ -177,7 +188,7 @@ find_appropriate_src(const struct ip_con - struct ip_conntrack *ct; - - read_lock_bh(&ip_nat_lock); -- list_for_each_entry(ct, &bysource[h], nat.info.bysource) { -+ list_for_each_entry(ct, &ve_ip_nat_bysource[h], nat.info.bysource) { - if (same_src(ct, tuple)) { - /* Copy source part from reply tuple. 
*/ - invert_tuplepr(result, -@@ -291,13 +302,22 @@ get_unique_tuple(struct ip_conntrack_tup - ip_nat_proto_put(proto); - } - -+void ip_nat_hash_conntrack(struct ip_conntrack *conntrack) -+{ -+ unsigned int srchash -+ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple); -+ write_lock_bh(&ip_nat_lock); -+ list_add(&conntrack->nat.info.bysource, &ve_ip_nat_bysource[srchash]); -+ write_unlock_bh(&ip_nat_lock); -+} -+EXPORT_SYMBOL_GPL(ip_nat_hash_conntrack); -+ - unsigned int - ip_nat_setup_info(struct ip_conntrack *conntrack, - const struct ip_nat_range *range, - unsigned int hooknum) - { - struct ip_conntrack_tuple curr_tuple, new_tuple; -- struct ip_nat_info *info = &conntrack->nat.info; - int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK); - enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); - -@@ -332,14 +352,8 @@ ip_nat_setup_info(struct ip_conntrack *c - } - - /* Place in source hash if this is the first time. */ -- if (have_to_hash) { -- unsigned int srchash -- = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] -- .tuple); -- write_lock_bh(&ip_nat_lock); -- list_add(&info->bysource, &bysource[srchash]); -- write_unlock_bh(&ip_nat_lock); -- } -+ if (have_to_hash) -+ ip_nat_hash_conntrack(conntrack); - - /* It's done. 
*/ - if (maniptype == IP_NAT_MANIP_DST) -@@ -521,11 +535,11 @@ int ip_nat_protocol_register(struct ip_n - int ret = 0; - - write_lock_bh(&ip_nat_lock); -- if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { -+ if (ve_ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { - ret = -EBUSY; - goto out; - } -- ip_nat_protos[proto->protonum] = proto; -+ ve_ip_nat_protos[proto->protonum] = proto; - out: - write_unlock_bh(&ip_nat_lock); - return ret; -@@ -536,7 +550,7 @@ EXPORT_SYMBOL(ip_nat_protocol_register); - void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) - { - write_lock_bh(&ip_nat_lock); -- ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; -+ ve_ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; - write_unlock_bh(&ip_nat_lock); - - /* Someone could be still looking at the proto in a bh. */ -@@ -589,38 +603,55 @@ EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_ - EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr); - #endif - --static int __init ip_nat_init(void) -+static int ip_nat_init(void) - { - size_t i; -+ int ret; - -- /* Leave them the same for the moment. */ -- ip_nat_htable_size = ip_conntrack_htable_size; -+ if (ve_is_super(get_exec_env())) -+ ip_nat_htable_size = ip_conntrack_htable_size; - - /* One vmalloc for both hash tables */ -- bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size); -- if (!bysource) -- return -ENOMEM; -+ ret = -ENOMEM; -+ ve_ip_nat_bysource = -+ ub_vmalloc(sizeof(struct list_head)*ip_nat_htable_size*2); -+ if (!ve_ip_nat_bysource) -+ goto nomem; -+ -+#ifdef CONFIG_VE_IPTABLES -+ ve_ip_nat_protos = -+ ub_kmalloc(sizeof(void *)*MAX_IP_NAT_PROTO, GFP_KERNEL); -+ if (!ve_ip_nat_protos) -+ goto nomem2; -+#endif - - /* Sew in builtin protocols. 
*/ - write_lock_bh(&ip_nat_lock); - for (i = 0; i < MAX_IP_NAT_PROTO; i++) -- ip_nat_protos[i] = &ip_nat_unknown_protocol; -- ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; -- ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; -- ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; -+ ve_ip_nat_protos[i] = &ip_nat_unknown_protocol; -+ ve_ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; -+ ve_ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; -+ ve_ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; - write_unlock_bh(&ip_nat_lock); - - for (i = 0; i < ip_nat_htable_size; i++) { -- INIT_LIST_HEAD(&bysource[i]); -+ INIT_LIST_HEAD(&ve_ip_nat_bysource[i]); - } - - /* FIXME: Man, this is a hack. <SIGH> */ - IP_NF_ASSERT(ip_conntrack_destroyed == NULL); -- ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; -+ ve_ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; - -- /* Initialize fake conntrack so that NAT will skip it */ -- ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; -+ if (ve_is_super(get_exec_env())) -+ /* Initialize fake conntrack so that NAT will skip it */ -+ ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; - return 0; -+#ifdef CONFIG_VE_IPTABLES -+nomem2: -+#endif -+ vfree(ve_ip_nat_bysource); -+nomem: -+ return ret; - } - - /* Clear NAT section of all conntracks, in case we're loaded again. 
*/ -@@ -631,14 +662,41 @@ static int clean_nat(struct ip_conntrack - return 0; - } - --static void __exit ip_nat_cleanup(void) -+static void ip_nat_cleanup(void) - { - ip_ct_iterate_cleanup(&clean_nat, NULL); -- ip_conntrack_destroyed = NULL; -- vfree(bysource); -+ ve_ip_conntrack_destroyed = NULL; -+ vfree(ve_ip_nat_bysource); -+ ve_ip_nat_bysource = NULL; -+#ifdef CONFIG_VE_IPTABLES -+ kfree(ve_ip_nat_protos); -+ ve_ip_nat_protos = NULL; -+#endif -+} -+ -+static int __init init(void) -+{ -+ int err; -+ -+ err = ip_nat_init(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(ip_nat_init); -+ KSYMRESOLVE(ip_nat_cleanup); -+ KSYMMODRESOLVE(ip_nat); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ip_nat); -+ KSYMUNRESOLVE(ip_nat_cleanup); -+ KSYMUNRESOLVE(ip_nat_init); -+ ip_nat_cleanup(); - } - - MODULE_LICENSE("GPL"); - --module_init(ip_nat_init); --module_exit(ip_nat_cleanup); -+fs_initcall(init); -+module_exit(fini); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_ftp.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_ftp.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_ftp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_ftp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -19,6 +19,7 @@ - #include <linux/netfilter_ipv4/ip_nat_rule.h> - #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> - #include <linux/netfilter_ipv4/ip_conntrack_helper.h> -+#include <linux/nfcalls.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -@@ -154,18 +155,43 @@ static unsigned int ip_nat_ftp(struct sk - return NF_ACCEPT; - } - --static void __exit fini(void) -+#ifdef CONFIG_VE_IPTABLES -+#undef ve_ip_nat_ftp_hook -+#define ve_ip_nat_ftp_hook \ -+ (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook) -+#endif -+int init_iptable_nat_ftp(void) - { -- ip_nat_ftp_hook = NULL; -+ BUG_ON(ve_ip_nat_ftp_hook); -+#ifdef CONFIG_VE_IPTABLES -+ ve_ip_nat_ftp_hook = 
(ip_nat_helper_func)ip_nat_ftp; -+#else -+ ve_ip_nat_ftp_hook = ip_nat_ftp; -+#endif -+ return 0; -+} -+ -+void fini_iptable_nat_ftp(void) -+{ -+ ve_ip_nat_ftp_hook = NULL; - /* Make sure noone calls it, meanwhile. */ - synchronize_net(); - } - -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ip_nat_ftp); -+ KSYMUNRESOLVE(init_iptable_nat_ftp); -+ KSYMUNRESOLVE(fini_iptable_nat_ftp); -+ fini_iptable_nat_ftp(); -+} -+ - static int __init init(void) - { -- BUG_ON(ip_nat_ftp_hook); -- ip_nat_ftp_hook = ip_nat_ftp; -- return 0; -+ KSYMRESOLVE(init_iptable_nat_ftp); -+ KSYMRESOLVE(fini_iptable_nat_ftp); -+ KSYMMODRESOLVE(ip_nat_ftp); -+ return init_iptable_nat_ftp(); - } - - /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_irc.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_irc.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_irc.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_irc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -23,6 +23,7 @@ - #include <linux/netfilter_ipv4/ip_conntrack_irc.h> - #include <linux/netfilter_ipv4/ip_conntrack_helper.h> - #include <linux/moduleparam.h> -+#include <linux/nfcalls.h> - - #if 0 - #define DEBUGP printk -@@ -96,18 +97,44 @@ static unsigned int help(struct sk_buff - return ret; - } - --static void __exit fini(void) -+#ifdef CONFIG_VE_IPTABLES -+#undef ve_ip_nat_irc_hook -+#define ve_ip_nat_irc_hook \ -+ (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook) -+#endif -+ -+int init_iptable_nat_irc(void) -+{ -+ BUG_ON(ve_ip_nat_irc_hook); -+#ifdef CONFIG_VE_IPTABLES -+ ve_ip_nat_irc_hook = (ip_nat_helper_func)help; -+#else -+ ve_ip_nat_irc_hook = help; -+#endif -+ return 0; -+} -+ -+void fini_iptable_nat_irc(void) - { -- ip_nat_irc_hook = NULL; -+ ve_ip_nat_irc_hook = NULL; - /* Make sure noone calls it, meanwhile. 
*/ - synchronize_net(); - } - -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ip_nat_irc); -+ KSYMUNRESOLVE(init_iptable_nat_irc); -+ KSYMUNRESOLVE(fini_iptable_nat_irc); -+ fini_iptable_nat_irc(); -+} -+ - static int __init init(void) - { -- BUG_ON(ip_nat_irc_hook); -- ip_nat_irc_hook = help; -- return 0; -+ KSYMRESOLVE(init_iptable_nat_irc); -+ KSYMRESOLVE(fini_iptable_nat_irc); -+ KSYMMODRESOLVE(ip_nat_irc); -+ return init_iptable_nat_irc(); - } - - /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_rule.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_rule.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_rule.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_rule.c 2006-04-19 15:02:12.000000000 +0400 -@@ -34,6 +34,13 @@ - #define DEBUGP(format, args...) - #endif - -+#ifdef CONFIG_VE_IPTABLES -+#define ve_ip_nat_table \ -+ (get_exec_env()->_ip_conntrack->_ip_nat_table) -+#else -+#define ve_ip_nat_table &nat_table -+#endif -+ - #define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) - - static struct -@@ -41,7 +48,7 @@ static struct - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; --} nat_initial_table __initdata -+} nat_initial_table - = { { "nat", NAT_VALID_HOOKS, 4, - sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - { [NF_IP_PRE_ROUTING] = 0, -@@ -235,6 +242,93 @@ static int ipt_dnat_checkentry(const cha - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat_to_user(void *target, void **dstptr, -+ int *size, int off) -+{ -+ struct ipt_entry_target *pt; -+ struct ip_nat_multi_range_compat *pinfo; -+ struct compat_ip_nat_multi_range info; -+ u_int16_t tsize; -+ -+ pt = (struct ipt_entry_target *)target; -+ tsize = pt->u.user.target_size; -+ if (__copy_to_user(*dstptr, pt, sizeof(struct ipt_entry_target))) -+ return -EFAULT; -+ 
pinfo = (struct ip_nat_multi_range_compat *)pt->data; -+ memset(&info, 0, sizeof(struct compat_ip_nat_multi_range)); -+ info.rangesize = pinfo->rangesize; -+ info.range[0].flags = pinfo->range[0].flags; -+ info.range[0].min_ip = pinfo->range[0].min_ip; -+ info.range[0].max_ip = pinfo->range[0].max_ip; -+ info.range[0].min = pinfo->range[0].min; -+ info.range[0].max = pinfo->range[0].max; -+ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_target), -+ &info, sizeof(struct compat_ip_nat_multi_range))) -+ return -EFAULT; -+ tsize -= off; -+ if (put_user(tsize, (u_int16_t *)*dstptr)) -+ return -EFAULT; -+ *size -= off; -+ *dstptr += tsize; -+ return 0; -+} -+ -+static int compat_from_user(void *target, void **dstptr, -+ int *size, int off) -+{ -+ struct compat_ipt_entry_target *pt; -+ struct ipt_entry_target *dstpt; -+ struct compat_ip_nat_multi_range *pinfo; -+ struct ip_nat_multi_range_compat info; -+ u_int16_t tsize; -+ -+ pt = (struct compat_ipt_entry_target *)target; -+ dstpt = (struct ipt_entry_target *)*dstptr; -+ tsize = pt->u.user.target_size; -+ memcpy(*dstptr, pt, sizeof(struct compat_ipt_entry_target)); -+ pinfo = (struct compat_ip_nat_multi_range *)pt->data; -+ memset(&info, 0, sizeof(struct ip_nat_multi_range_compat)); -+ info.rangesize = pinfo->rangesize; -+ info.range[0].flags = pinfo->range[0].flags; -+ info.range[0].min_ip = pinfo->range[0].min_ip; -+ info.range[0].max_ip = pinfo->range[0].max_ip; -+ info.range[0].min = pinfo->range[0].min; -+ info.range[0].max = pinfo->range[0].max; -+ memcpy(*dstptr + sizeof(struct compat_ipt_entry_target), -+ &info, sizeof(struct ip_nat_multi_range_compat)); -+ tsize += off; -+ dstpt->u.user.target_size = tsize; -+ *size += off; -+ *dstptr += tsize; -+ return 0; -+} -+ -+static int compat(void *target, void **dstptr, int *size, int convert) -+{ -+ int ret, off; -+ -+ off = IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat)) - -+ COMPAT_IPT_ALIGN(sizeof(struct compat_ip_nat_multi_range)); -+ switch (convert) { 
-+ case COMPAT_TO_USER: -+ ret = compat_to_user(target, dstptr, size, off); -+ break; -+ case COMPAT_FROM_USER: -+ ret = compat_from_user(target, dstptr, size, off); -+ break; -+ case COMPAT_CALC_SIZE: -+ *size += off; -+ ret = 0; -+ break; -+ default: -+ ret = -ENOPROTOOPT; -+ break; -+ } -+ return ret; -+} -+#endif -+ - inline unsigned int - alloc_null_binding(struct ip_conntrack *conntrack, - struct ip_nat_info *info, -@@ -286,7 +380,7 @@ int ip_nat_rule_find(struct sk_buff **ps - { - int ret; - -- ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL); -+ ret = ipt_do_table(pskb, hooknum, in, out, ve_ip_nat_table, NULL); - - if (ret == NF_ACCEPT) { - if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum))) -@@ -300,21 +394,33 @@ static struct ipt_target ipt_snat_reg = - .name = "SNAT", - .target = ipt_snat_target, - .checkentry = ipt_snat_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - }; - - static struct ipt_target ipt_dnat_reg = { - .name = "DNAT", - .target = ipt_dnat_target, - .checkentry = ipt_dnat_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - }; - --int __init ip_nat_rule_init(void) -+int ip_nat_rule_init(void) - { - int ret; -+ struct ipt_table *tmp_table; -+ -+ tmp_table = ipt_register_table(&nat_table, -+ &nat_initial_table.repl); -+ if (IS_ERR(tmp_table)) -+ return PTR_ERR(tmp_table); -+#ifdef CONFIG_VE_IPTABLES -+ ve_ip_nat_table = tmp_table; -+#endif - -- ret = ipt_register_table(&nat_table, &nat_initial_table.repl); -- if (ret != 0) -- return ret; - ret = ipt_register_target(&ipt_snat_reg); - if (ret != 0) - goto unregister_table; -@@ -328,7 +434,10 @@ int __init ip_nat_rule_init(void) - unregister_snat: - ipt_unregister_target(&ipt_snat_reg); - unregister_table: -- ipt_unregister_table(&nat_table); -+ ipt_unregister_table(ve_ip_nat_table); -+#ifdef CONFIG_VE_IPTABLES -+ ve_ip_nat_table = NULL; -+#endif - - return ret; - } -@@ -337,5 +446,8 @@ void ip_nat_rule_cleanup(void) - { - 
ipt_unregister_target(&ipt_dnat_reg); - ipt_unregister_target(&ipt_snat_reg); -- ipt_unregister_table(&nat_table); -+ ipt_unregister_table(ve_ip_nat_table); -+#ifdef CONFIG_VE_IPTABLES -+ ve_ip_nat_table = NULL; -+#endif - } -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_standalone.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_standalone.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_standalone.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_nat_standalone.c 2006-04-19 15:02:12.000000000 +0400 -@@ -30,6 +30,7 @@ - #include <net/ip.h> - #include <net/checksum.h> - #include <linux/spinlock.h> -+#include <linux/nfcalls.h> - - #define ASSERT_READ_LOCK(x) - #define ASSERT_WRITE_LOCK(x) -@@ -358,45 +359,45 @@ static int init_or_cleanup(int init) - { - int ret = 0; - -- need_conntrack(); -- - if (!init) goto cleanup; - --#ifdef CONFIG_XFRM -- BUG_ON(ip_nat_decode_session != NULL); -- ip_nat_decode_session = nat_decode_session; --#endif -+ if (!ve_is_super(get_exec_env())) -+ __module_get(THIS_MODULE); -+ - ret = ip_nat_rule_init(); - if (ret < 0) { - printk("ip_nat_init: can't setup rules.\n"); -- goto cleanup_decode_session; -+ goto cleanup_modput; - } -- ret = nf_register_hook(&ip_nat_in_ops); -+ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) -+ return 0; -+ -+ ret = virt_nf_register_hook(&ip_nat_in_ops); - if (ret < 0) { - printk("ip_nat_init: can't register in hook.\n"); - goto cleanup_rule_init; - } -- ret = nf_register_hook(&ip_nat_out_ops); -+ ret = virt_nf_register_hook(&ip_nat_out_ops); - if (ret < 0) { - printk("ip_nat_init: can't register out hook.\n"); - goto cleanup_inops; - } -- ret = nf_register_hook(&ip_nat_adjust_in_ops); -+ ret = virt_nf_register_hook(&ip_nat_adjust_in_ops); - if (ret < 0) { - printk("ip_nat_init: can't register adjust in hook.\n"); - goto cleanup_outops; - } -- ret = nf_register_hook(&ip_nat_adjust_out_ops); -+ ret = virt_nf_register_hook(&ip_nat_adjust_out_ops); - 
if (ret < 0) { - printk("ip_nat_init: can't register adjust out hook.\n"); - goto cleanup_adjustin_ops; - } -- ret = nf_register_hook(&ip_nat_local_out_ops); -+ ret = virt_nf_register_hook(&ip_nat_local_out_ops); - if (ret < 0) { - printk("ip_nat_init: can't register local out hook.\n"); - goto cleanup_adjustout_ops;; - } -- ret = nf_register_hook(&ip_nat_local_in_ops); -+ ret = virt_nf_register_hook(&ip_nat_local_in_ops); - if (ret < 0) { - printk("ip_nat_init: can't register local in hook.\n"); - goto cleanup_localoutops; -@@ -404,38 +405,76 @@ static int init_or_cleanup(int init) - return ret; - - cleanup: -- nf_unregister_hook(&ip_nat_local_in_ops); -+ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) -+ goto cleanup_rule_init; -+ virt_nf_unregister_hook(&ip_nat_local_in_ops); - cleanup_localoutops: -- nf_unregister_hook(&ip_nat_local_out_ops); -+ virt_nf_unregister_hook(&ip_nat_local_out_ops); - cleanup_adjustout_ops: -- nf_unregister_hook(&ip_nat_adjust_out_ops); -+ virt_nf_unregister_hook(&ip_nat_adjust_out_ops); - cleanup_adjustin_ops: -- nf_unregister_hook(&ip_nat_adjust_in_ops); -+ virt_nf_unregister_hook(&ip_nat_adjust_in_ops); - cleanup_outops: -- nf_unregister_hook(&ip_nat_out_ops); -+ virt_nf_unregister_hook(&ip_nat_out_ops); - cleanup_inops: -- nf_unregister_hook(&ip_nat_in_ops); -+ virt_nf_unregister_hook(&ip_nat_in_ops); - cleanup_rule_init: - ip_nat_rule_cleanup(); -- cleanup_decode_session: --#ifdef CONFIG_XFRM -- ip_nat_decode_session = NULL; -- synchronize_net(); --#endif -+ cleanup_modput: -+ if (!ve_is_super(get_exec_env())) -+ module_put(THIS_MODULE); - return ret; - } - --static int __init init(void) -+int init_iptable_nat(void) - { - return init_or_cleanup(1); - } - --static void __exit fini(void) -+void fini_iptable_nat(void) - { - init_or_cleanup(0); - } - --module_init(init); -+static int __init init(void) -+{ -+ int err; -+ -+ need_conntrack(); -+ -+#ifdef CONFIG_XFRM -+ BUG_ON(ip_nat_decode_session != NULL); -+ 
ip_nat_decode_session = nat_decode_session; -+#endif -+ -+ err = init_iptable_nat(); -+ if (err < 0) { -+#ifdef CONFIG_XFRM -+ ip_nat_decode_session = NULL; -+ synchronize_net(); -+#endif -+ return err; -+ } -+ -+ KSYMRESOLVE(init_iptable_nat); -+ KSYMRESOLVE(fini_iptable_nat); -+ KSYMMODRESOLVE(iptable_nat); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(iptable_nat); -+ KSYMUNRESOLVE(init_iptable_nat); -+ KSYMUNRESOLVE(fini_iptable_nat); -+ fini_iptable_nat(); -+#ifdef CONFIG_XFRM -+ ip_nat_decode_session = NULL; -+ synchronize_net(); -+#endif -+} -+ -+fs_initcall(init); - module_exit(fini); - - MODULE_LICENSE("GPL"); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_queue.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_queue.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_queue.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_queue.c 2006-04-19 15:02:12.000000000 +0400 -@@ -542,8 +542,17 @@ ipq_rcv_sk(struct sock *sk, int len) - down(&ipqnl_sem); - - for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { -+#ifdef CONFIG_VE -+ struct ve_struct *env; -+#endif - skb = skb_dequeue(&sk->sk_receive_queue); -+#ifdef CONFIG_VE -+ env = set_exec_env(VE_OWNER_SKB(skb)); - ipq_rcv_skb(skb); -+ (void)set_exec_env(env); -+#else -+ ipq_rcv_skb(skb); -+#endif - kfree_skb(skb); - } - -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_tables.c linux-2.6.16-026test009/net/ipv4/netfilter/ip_tables.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ip_tables.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ip_tables.c 2006-04-19 15:02:12.000000000 +0400 -@@ -24,14 +24,17 @@ - #include <linux/module.h> - #include <linux/icmp.h> - #include <net/ip.h> -+#include <net/compat.h> - #include <asm/uaccess.h> - #include <asm/semaphore.h> - #include <linux/proc_fs.h> - #include <linux/err.h> - #include <linux/cpumask.h> -+#include <ub/ub_mem.h> - - #include 
<linux/netfilter/x_tables.h> - #include <linux/netfilter_ipv4/ip_tables.h> -+#include <linux/nfcalls.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -@@ -70,6 +73,14 @@ do { \ - #define inline - #endif - -+#ifdef CONFIG_VE_IPTABLES -+/* include ve.h and define get_exec_env */ -+#include <linux/sched.h> -+#define ve_ipt_standard_target (get_exec_env()->_ipt_standard_target) -+#else -+#define ve_ipt_standard_target &ipt_standard_target -+#endif -+ - /* - We keep a set of rules for each CPU, so we can avoid write-locking - them in the softirq when updating the counters and therefore -@@ -480,7 +491,7 @@ standard_check(const struct ipt_entry_ta - if (t->u.target_size - != IPT_ALIGN(sizeof(struct ipt_standard_target))) { - duprintf("standard_check: target size %u != %u\n", -- t->u.target_size, -+ t->u.target_size, (unsigned int) - IPT_ALIGN(sizeof(struct ipt_standard_target))); - return 0; - } -@@ -565,7 +576,7 @@ check_entry(struct ipt_entry *e, const c - } - t->u.kernel.target = target; - -- if (t->u.kernel.target == &ipt_standard_target) { -+ if (t->u.kernel.target == ve_ipt_standard_target) { - if (!standard_check(t, size)) { - ret = -EINVAL; - goto cleanup_matches; -@@ -790,32 +801,45 @@ get_counters(const struct xt_table_info - } - } - --static int --copy_entries_to_user(unsigned int total_size, -- struct ipt_table *table, -- void __user *userptr) -+static inline struct xt_counters * alloc_counters(struct ipt_table *table) - { -- unsigned int off, num, countersize; -- struct ipt_entry *e; -+ unsigned int countersize; - struct xt_counters *counters; - struct xt_table_info *private = table->private; -- int ret = 0; -- void *loc_cpu_entry; - - /* We need atomic snapshot of counters: rest doesn't change - (other than comefrom, which userspace doesn't care - about). 
*/ - countersize = sizeof(struct xt_counters) * private->number; -- counters = vmalloc_node(countersize, numa_node_id()); -+ counters = ub_vmalloc_node(countersize, numa_node_id()); - - if (counters == NULL) -- return -ENOMEM; -+ return ERR_PTR(-ENOMEM); - - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); - -+ return counters; -+} -+ -+static int -+copy_entries_to_user(unsigned int total_size, -+ struct ipt_table *table, -+ void __user *userptr) -+{ -+ unsigned int off, num; -+ struct ipt_entry *e; -+ struct xt_counters *counters; -+ struct xt_table_info *private = table->private; -+ int ret = 0; -+ void *loc_cpu_entry; -+ -+ counters = alloc_counters(table); -+ if (IS_ERR(counters)) -+ return PTR_ERR(counters); -+ - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) -@@ -875,25 +899,391 @@ copy_entries_to_user(unsigned int total_ - return ret; - } - -+#ifdef CONFIG_COMPAT -+static DECLARE_MUTEX(compat_ipt_mutex); -+ -+struct compat_delta { -+ struct compat_delta *next; -+ u_int16_t offset; -+ short delta; -+}; -+ -+static struct compat_delta *compat_offsets = NULL; -+ -+static int compat_add_offset(u_int16_t offset, short delta) -+{ -+ struct compat_delta *tmp; -+ -+ tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ tmp->offset = offset; -+ tmp->delta = delta; -+ if (compat_offsets) { -+ tmp->next = compat_offsets->next; -+ compat_offsets->next = tmp; -+ } else { -+ compat_offsets = tmp; -+ tmp->next = NULL; -+ } -+ return 0; -+} -+ -+static void compat_flush_offsets(void) -+{ -+ struct compat_delta *tmp, *next; -+ -+ if (compat_offsets) { -+ for(tmp = compat_offsets; tmp; tmp = next) { -+ next = tmp->next; -+ kfree(tmp); -+ } -+ compat_offsets = NULL; -+ } -+} -+ -+static short compat_calc_jump(u_int16_t offset) -+{ -+ struct compat_delta *tmp; -+ short 
delta; -+ -+ for(tmp = compat_offsets, delta = 0; tmp; tmp = tmp->next) -+ if (tmp->offset < offset) -+ delta += tmp->delta; -+ return delta; -+} -+ -+struct compat_ipt_standard_target -+{ -+ struct compat_ipt_entry_target target; -+ compat_int_t verdict; -+}; -+ -+#define IPT_ST_OFFSET (sizeof(struct ipt_standard_target) - \ -+ sizeof(struct compat_ipt_standard_target)) -+ -+struct compat_ipt_standard -+{ -+ struct compat_ipt_entry entry; -+ struct compat_ipt_standard_target target; -+}; -+ -+static int compat_ipt_standard_fn(void *target, -+ void **dstptr, int *size, int convert) -+{ -+ struct compat_ipt_standard_target compat_st, *pcompat_st; -+ struct ipt_standard_target st, *pst; -+ int ret; -+ -+ ret = 0; -+ switch (convert) { -+ case COMPAT_TO_USER: -+ pst = (struct ipt_standard_target *)target; -+ memcpy(&compat_st.target, &pst->target, -+ sizeof(struct ipt_entry_target)); -+ compat_st.verdict = pst->verdict; -+ if (compat_st.verdict > 0) -+ compat_st.verdict -= -+ compat_calc_jump(compat_st.verdict); -+ compat_st.target.u.user.target_size = -+ sizeof(struct compat_ipt_standard_target); -+ if (__copy_to_user(*dstptr, &compat_st, -+ sizeof(struct compat_ipt_standard_target))) -+ ret = -EFAULT; -+ *size -= IPT_ST_OFFSET; -+ *dstptr += sizeof(struct compat_ipt_standard_target); -+ break; -+ case COMPAT_FROM_USER: -+ pcompat_st = -+ (struct compat_ipt_standard_target *)target; -+ memcpy(&st.target, &pcompat_st->target, -+ sizeof(struct ipt_entry_target)); -+ st.verdict = pcompat_st->verdict; -+ if (st.verdict > 0) -+ st.verdict += compat_calc_jump(st.verdict); -+ st.target.u.user.target_size = -+ sizeof(struct ipt_standard_target); -+ memcpy(*dstptr, &st, -+ sizeof(struct ipt_standard_target)); -+ *size += IPT_ST_OFFSET; -+ *dstptr += sizeof(struct ipt_standard_target); -+ break; -+ case COMPAT_CALC_SIZE: -+ *size += IPT_ST_OFFSET; -+ break; -+ default: -+ ret = -ENOPROTOOPT; -+ break; -+ } -+ return ret; -+} -+ -+int ipt_target_align_compat(void *target, void 
**dstptr, -+ int *size, int off, int convert) -+{ -+ struct compat_ipt_entry_target *pcompat; -+ struct ipt_entry_target *pt; -+ u_int16_t tsize; -+ int ret; -+ -+ ret = 0; -+ switch (convert) { -+ case COMPAT_TO_USER: -+ pt = (struct ipt_entry_target *)target; -+ tsize = pt->u.user.target_size; -+ if (__copy_to_user(*dstptr, pt, tsize)) { -+ ret = -EFAULT; -+ break; -+ } -+ tsize -= off; -+ if (put_user(tsize, (u_int16_t *)*dstptr)) -+ ret = -EFAULT; -+ *size -= off; -+ *dstptr += tsize; -+ break; -+ case COMPAT_FROM_USER: -+ pcompat = (struct compat_ipt_entry_target *)target; -+ pt = (struct ipt_entry_target *)*dstptr; -+ tsize = pcompat->u.user.target_size; -+ memcpy(pt, pcompat, tsize); -+ tsize += off; -+ pt->u.user.target_size = tsize; -+ *size += off; -+ *dstptr += tsize; -+ break; -+ case COMPAT_CALC_SIZE: -+ *size += off; -+ break; -+ default: -+ ret = -ENOPROTOOPT; -+ break; -+ } -+ return ret; -+} -+ -+int ipt_match_align_compat(void *match, void **dstptr, -+ int *size, int off, int convert) -+{ -+ struct compat_ipt_entry_match *pcompat_m; -+ struct ipt_entry_match *pm; -+ u_int16_t msize; -+ int ret; -+ -+ ret = 0; -+ switch (convert) { -+ case COMPAT_TO_USER: -+ pm = (struct ipt_entry_match *)match; -+ msize = pm->u.user.match_size; -+ if (__copy_to_user(*dstptr, pm, msize)) { -+ ret = -EFAULT; -+ break; -+ } -+ msize -= off; -+ if (put_user(msize, (u_int16_t *)*dstptr)) -+ ret = -EFAULT; -+ *size -= off; -+ *dstptr += msize; -+ break; -+ case COMPAT_FROM_USER: -+ pcompat_m = (struct compat_ipt_entry_match *)match; -+ pm = (struct ipt_entry_match *)*dstptr; -+ msize = pcompat_m->u.user.match_size; -+ memcpy(pm, pcompat_m, msize); -+ msize += off; -+ pm->u.user.match_size = msize; -+ *size += off; -+ *dstptr += msize; -+ break; -+ case COMPAT_CALC_SIZE: -+ *size += off; -+ break; -+ default: -+ ret = -ENOPROTOOPT; -+ break; -+ } -+ return ret; -+} -+ -+static int icmp_compat(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ 
off = IPT_ALIGN(sizeof(struct ipt_icmp)) - -+ COMPAT_IPT_ALIGN(sizeof(struct ipt_icmp)); -+ return ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+ -+static inline int -+compat_calc_match(struct ipt_entry_match *m, int * size) -+{ -+ if (m->u.kernel.match->compat) -+ m->u.kernel.match->compat(m, NULL, size, COMPAT_CALC_SIZE); -+ return 0; -+} -+ -+static int compat_calc_entry(struct ipt_entry *e, struct xt_table_info *info, -+ void *base, struct xt_table_info *newinfo) -+{ -+ struct ipt_entry_target *t; -+ u_int16_t entry_offset; -+ int off, i, ret; -+ -+ off = 0; -+ entry_offset = (void *)e - base; -+ IPT_MATCH_ITERATE(e, compat_calc_match, &off); -+ t = ipt_get_target(e); -+ if (t->u.kernel.target->compat) -+ t->u.kernel.target->compat(t, NULL, &off, COMPAT_CALC_SIZE); -+ newinfo->size -= off; -+ ret = compat_add_offset(entry_offset, off); -+ if (ret) -+ return ret; -+ -+ for (i = 0; i< NF_IP_NUMHOOKS; i++) { -+ if (info->hook_entry[i] && (e < (struct ipt_entry *) -+ (base + info->hook_entry[i]))) -+ newinfo->hook_entry[i] -= off; -+ if (info->underflow[i] && (e < (struct ipt_entry *) -+ (base + info->underflow[i]))) -+ newinfo->underflow[i] -= off; -+ } -+ return 0; -+} -+ -+static int compat_table_info(struct xt_table_info *info, -+ struct xt_table_info *newinfo) -+{ -+ void *loc_cpu_entry; -+ int i; -+ -+ if (!newinfo || !info) -+ return -EINVAL; -+ -+ memset(newinfo, 0, sizeof(struct xt_table_info)); -+ newinfo->size = info->size; -+ for (i = 0; i < NF_IP_NUMHOOKS; i++) { -+ newinfo->hook_entry[i] = info->hook_entry[i]; -+ newinfo->underflow[i] = info->underflow[i]; -+ } -+ loc_cpu_entry = info->entries[raw_smp_processor_id()]; -+ return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size, -+ compat_calc_entry, info, loc_cpu_entry, newinfo); -+} -+#endif -+ -+static int get_info(void __user *user, int *len) -+{ -+ char name[IPT_TABLE_MAXNAMELEN]; -+ struct ipt_table *t; -+ int ret, size; -+ -+#ifdef CONFIG_COMPAT -+ if (is_current_32bits()) -+ size = 
sizeof(struct compat_ipt_getinfo); -+ else -+#endif -+ size = sizeof(struct ipt_getinfo); -+ -+ if (*len != size) { -+ duprintf("length %u != %u\n", *len, -+ (unsigned int)sizeof(struct ipt_getinfo)); -+ return -EINVAL; -+ } -+ -+ if (copy_from_user(name, user, sizeof(name)) != 0) -+ return -EFAULT; -+ -+ name[IPT_TABLE_MAXNAMELEN-1] = '\0'; -+#ifdef CONFIG_COMPAT -+ down(&compat_ipt_mutex); -+#endif -+ t = try_then_request_module(xt_find_table_lock(AF_INET, name), -+ "iptable_%s", name); -+ if (t && !IS_ERR(t)) { -+ struct ipt_getinfo info; -+ struct xt_table_info *private = t->private; -+#ifdef CONFIG_COMPAT -+ struct compat_ipt_getinfo compat_info; -+#endif -+ void *pinfo; -+ -+#ifdef CONFIG_COMPAT -+ if (is_current_32bits()) { -+ struct xt_table_info tmp; -+ ret = compat_table_info(private, &tmp); -+ compat_flush_offsets(); -+ memcpy(compat_info.hook_entry, tmp.hook_entry, -+ sizeof(compat_info.hook_entry)); -+ memcpy(compat_info.underflow, tmp.underflow, -+ sizeof(compat_info.underflow)); -+ compat_info.valid_hooks = t->valid_hooks; -+ compat_info.num_entries = private->number; -+ compat_info.size = tmp.size; -+ strcpy(compat_info.name, name); -+ pinfo = (void *)&compat_info; -+ } else -+#endif -+ { -+ info.valid_hooks = t->valid_hooks; -+ memcpy(info.hook_entry, private->hook_entry, -+ sizeof(info.hook_entry)); -+ memcpy(info.underflow, private->underflow, -+ sizeof(info.underflow)); -+ info.num_entries = private->number; -+ info.size = private->size; -+ strcpy(info.name, name); -+ pinfo = (void *)&info; -+ } -+ -+ if (copy_to_user(user, pinfo, *len) != 0) -+ ret = -EFAULT; -+ else -+ ret = 0; -+ -+ xt_table_unlock(t); -+ module_put(t->me); -+ } else -+ ret = t ? 
PTR_ERR(t) : -ENOENT; -+#ifdef CONFIG_COMPAT -+ up(&compat_ipt_mutex); -+#endif -+ return ret; -+} -+ - static int --get_entries(const struct ipt_get_entries *entries, -- struct ipt_get_entries __user *uptr) -+get_entries(struct ipt_get_entries __user *uptr, int *len) - { - int ret; -+ struct ipt_get_entries get; - struct ipt_table *t; - -- t = xt_find_table_lock(AF_INET, entries->name); -+ if (*len < sizeof(get)) { -+ duprintf("get_entries: %u < %d\n", *len, -+ (unsigned int)sizeof(get)); -+ return -EINVAL; -+ } -+ if (copy_from_user(&get, uptr, sizeof(get)) != 0) -+ return -EFAULT; -+ if (*len != sizeof(struct ipt_get_entries) + get.size) { -+ duprintf("get_entries: %u != %u\n", *len, -+ (unsigned int)(sizeof(struct ipt_get_entries) + -+ get.size)); -+ return -EINVAL; -+ } -+ -+ t = xt_find_table_lock(AF_INET, get.name); - if (t && !IS_ERR(t)) { - struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", - private->number); -- if (entries->size == private->size) -+ if (get.size == private->size) - ret = copy_entries_to_user(private->size, - t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, -- entries->size); -+ get.size); - ret = -EINVAL; - } - module_put(t->me); -@@ -905,71 +1295,39 @@ get_entries(const struct ipt_get_entries - } - - static int --do_replace(void __user *user, unsigned int len) -+__do_replace(const char *name, unsigned int valid_hooks, -+ struct xt_table_info *newinfo, unsigned int num_counters, -+ void __user *counters_ptr) - { - int ret; -- struct ipt_replace tmp; - struct ipt_table *t; -- struct xt_table_info *newinfo, *oldinfo; -+ struct xt_table_info *oldinfo; - struct xt_counters *counters; -- void *loc_cpu_entry, *loc_cpu_old_entry; -- -- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) -- return -EFAULT; -- -- /* Hack: Causes ipchains to give correct error msg --RR */ -- if (len != sizeof(tmp) + tmp.size) -- return -ENOPROTOOPT; -- -- /* overflow check */ -- if 
(tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - -- SMP_CACHE_BYTES) -- return -ENOMEM; -- if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) -- return -ENOMEM; -- -- newinfo = xt_alloc_table_info(tmp.size); -- if (!newinfo) -- return -ENOMEM; -- -- /* choose the copy that is our node/cpu */ -- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; -- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), -- tmp.size) != 0) { -- ret = -EFAULT; -- goto free_newinfo; -- } -+ void *loc_cpu_old_entry; - -- counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters)); -+ ret = 0; -+ counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters)); - if (!counters) { - ret = -ENOMEM; -- goto free_newinfo; -+ goto out; - } - -- ret = translate_table(tmp.name, tmp.valid_hooks, -- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, -- tmp.hook_entry, tmp.underflow); -- if (ret != 0) -- goto free_newinfo_counters; -- -- duprintf("ip_tables: Translated table\n"); -- -- t = try_then_request_module(xt_find_table_lock(AF_INET, tmp.name), -- "iptable_%s", tmp.name); -+ t = try_then_request_module(xt_find_table_lock(AF_INET, name), -+ "iptable_%s", name); - if (!t || IS_ERR(t)) { - ret = t ? PTR_ERR(t) : -ENOENT; - goto free_newinfo_counters_untrans; - } - - /* You lied! 
*/ -- if (tmp.valid_hooks != t->valid_hooks) { -+ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", -- tmp.valid_hooks, t->valid_hooks); -+ valid_hooks, t->valid_hooks); - ret = -EINVAL; - goto put_module; - } - -- oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret); -+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); - if (!oldinfo) - goto put_module; - -@@ -989,8 +1347,8 @@ do_replace(void __user *user, unsigned i - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); - xt_free_table_info(oldinfo); -- if (copy_to_user(tmp.counters, counters, -- sizeof(struct xt_counters) * tmp.num_counters) != 0) -+ if (copy_to_user(counters_ptr, counters, -+ sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; - vfree(counters); - xt_table_unlock(t); -@@ -1000,9 +1358,62 @@ do_replace(void __user *user, unsigned i - module_put(t->me); - xt_table_unlock(t); - free_newinfo_counters_untrans: -- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); -- free_newinfo_counters: - vfree(counters); -+ out: -+ return ret; -+} -+ -+static int -+do_replace(void __user *user, unsigned int len) -+{ -+ int ret; -+ struct ipt_replace tmp; -+ struct xt_table_info *newinfo; -+ void *loc_cpu_entry; -+ -+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) -+ return -EFAULT; -+ -+ /* Hack: Causes ipchains to give correct error msg --RR */ -+ if (len != sizeof(tmp) + tmp.size) -+ return -ENOPROTOOPT; -+ -+ /* overflow check */ -+ if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - -+ SMP_CACHE_BYTES) -+ return -ENOMEM; -+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) -+ return -ENOMEM; -+ -+ newinfo = xt_alloc_table_info(tmp.size); -+ if (!newinfo) -+ return -ENOMEM; -+ -+ /* choose the copy that is our node/cpu */ -+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; -+ if 
(copy_from_user(loc_cpu_entry, user + sizeof(tmp), -+ tmp.size) != 0) { -+ ret = -EFAULT; -+ goto free_newinfo; -+ } -+ -+ ret = translate_table(tmp.name, tmp.valid_hooks, -+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, -+ tmp.hook_entry, tmp.underflow); -+ if (ret != 0) -+ goto free_newinfo; -+ -+ duprintf("ip_tables: Translated table\n"); -+ -+ ret = __do_replace(tmp.name, tmp.valid_hooks, -+ newinfo, tmp.num_counters, -+ tmp.counters); -+ if (ret) -+ goto free_newinfo_untrans; -+ return 0; -+ -+ free_newinfo_untrans: -+ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); - free_newinfo: - xt_free_table_info(newinfo); - return ret; -@@ -1034,28 +1445,56 @@ static int - do_add_counters(void __user *user, unsigned int len) - { - unsigned int i; -- struct xt_counters_info tmp, *paddc; -+ struct xt_counters_info tmp; -+ struct xt_counters *paddc; -+ unsigned int num_counters; -+ char *name; -+ int size; -+ void *ptmp; - struct ipt_table *t; - struct xt_table_info *private; - int ret = 0; - void *loc_cpu_entry; -+#ifdef CONFIG_COMPAT -+ struct compat_xt_counters_info compat_tmp; - -- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) -+ if (is_current_32bits()) { -+ ptmp = &compat_tmp; -+ size = sizeof(struct compat_xt_counters_info); -+ } else -+#endif -+ { -+ ptmp = &tmp; -+ size = sizeof(struct xt_counters_info); -+ } -+ -+ if (copy_from_user(ptmp, user, size) != 0) - return -EFAULT; - -- if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters)) -+#ifdef CONFIG_COMPAT -+ if (is_current_32bits()) { -+ num_counters = compat_tmp.num_counters; -+ name = compat_tmp.name; -+ } else -+#endif -+ { -+ num_counters = tmp.num_counters; -+ name = tmp.name; -+ } -+ -+ if (len != size + num_counters * sizeof(struct xt_counters)) - return -EINVAL; - -- paddc = vmalloc_node(len, numa_node_id()); -+ paddc = ub_vmalloc_node(len - size, numa_node_id()); - if (!paddc) - return -ENOMEM; - -- if (copy_from_user(paddc, user, len) != 0) { -+ if 
(copy_from_user(paddc, user + size, len - size) != 0) { - ret = -EFAULT; - goto free; - } - -- t = xt_find_table_lock(AF_INET, tmp.name); -+ t = xt_find_table_lock(AF_INET, name); - if (!t || IS_ERR(t)) { - ret = t ? PTR_ERR(t) : -ENOENT; - goto free; -@@ -1063,7 +1502,7 @@ do_add_counters(void __user *user, unsig - - write_lock_bh(&t->lock); - private = t->private; -- if (private->number != paddc->num_counters) { -+ if (private->number != num_counters) { - ret = -EINVAL; - goto unlock_up_free; - } -@@ -1074,7 +1513,7 @@ do_add_counters(void __user *user, unsig - IPT_ENTRY_ITERATE(loc_cpu_entry, - private->size, - add_counter_to_entry, -- paddc->counters, -+ paddc, - &i); - unlock_up_free: - write_unlock_bh(&t->lock); -@@ -1086,14 +1525,590 @@ do_add_counters(void __user *user, unsig - return ret; - } - -+#ifdef CONFIG_COMPAT -+struct compat_ipt_replace { -+ char name[IPT_TABLE_MAXNAMELEN]; -+ u32 valid_hooks; -+ u32 num_entries; -+ u32 size; -+ u32 hook_entry[NF_IP_NUMHOOKS]; -+ u32 underflow[NF_IP_NUMHOOKS]; -+ u32 num_counters; -+ compat_uptr_t counters; /* struct ipt_counters * */ -+ struct compat_ipt_entry entries[0]; -+}; -+ -+static inline int compat_copy_match_to_user(struct ipt_entry_match *m, -+ void __user **dstptr, compat_uint_t *size) -+{ -+ if (m->u.kernel.match->compat) -+ m->u.kernel.match->compat(m, dstptr, size, COMPAT_TO_USER); -+ else { -+ if (__copy_to_user(*dstptr, m, m->u.match_size)) -+ return -EFAULT; -+ *dstptr += m->u.match_size; -+ } -+ return 0; -+} -+ -+static int compat_copy_entry_to_user(struct ipt_entry *e, -+ void __user **dstptr, compat_uint_t *size) -+{ -+ struct ipt_entry_target __user *t; -+ struct compat_ipt_entry __user *ce; -+ u_int16_t target_offset, next_offset; -+ compat_uint_t origsize; -+ int ret; -+ -+ ret = -EFAULT; -+ origsize = *size; -+ ce = (struct compat_ipt_entry __user *)*dstptr; -+ if (__copy_to_user(ce, e, sizeof(struct ipt_entry))) -+ goto out; -+ -+ *dstptr += sizeof(struct compat_ipt_entry); -+ ret = 
IPT_MATCH_ITERATE(e, compat_copy_match_to_user, dstptr, size); -+ target_offset = e->target_offset - (origsize - *size); -+ if (ret) -+ goto out; -+ t = ipt_get_target(e); -+ if (t->u.kernel.target->compat) { -+ ret = t->u.kernel.target->compat(t, -+ dstptr, size, COMPAT_TO_USER); -+ if (ret) -+ goto out; -+ } else { -+ ret = -EFAULT; -+ if (__copy_to_user(*dstptr, t, t->u.target_size)) -+ goto out; -+ *dstptr += t->u.target_size; -+ } -+ ret = -EFAULT; -+ next_offset = e->next_offset - (origsize - *size); -+ if (__put_user(target_offset, &ce->target_offset)) -+ goto out; -+ if (__put_user(next_offset, &ce->next_offset)) -+ goto out; -+ return 0; -+out: -+ return ret; -+} -+ -+static inline int -+compat_check_calc_match(struct ipt_entry_match *m, -+ const char *name, -+ const struct ipt_ip *ip, -+ unsigned int hookmask, -+ int *size, int *i) -+{ -+ struct ipt_match *match; -+ -+ match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, -+ m->u.user.revision), -+ "ipt_%s", m->u.user.name); -+ if (IS_ERR(match) || !match) { -+ duprintf("compat_check_calc_match: `%s' not found\n", -+ m->u.user.name); -+ return match ? 
PTR_ERR(match) : -ENOENT; -+ } -+ m->u.kernel.match = match; -+ -+ if (m->u.kernel.match->compat) -+ m->u.kernel.match->compat(m, NULL, size, COMPAT_CALC_SIZE); -+ -+ (*i)++; -+ return 0; -+} -+ -+static inline int -+check_compat_entry_size_and_hooks(struct ipt_entry *e, -+ struct xt_table_info *newinfo, -+ unsigned int *size, -+ unsigned char *base, -+ unsigned char *limit, -+ unsigned int *hook_entries, -+ unsigned int *underflows, -+ unsigned int *i, -+ const char *name) -+{ -+ struct ipt_entry_target *t; -+ struct ipt_target *target; -+ u_int16_t entry_offset; -+ int ret, off, h, j; -+ -+ duprintf("check_compat_entry_size_and_hooks %p\n", e); -+ if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 -+ || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { -+ duprintf("Bad offset %p, limit = %p\n", e, limit); -+ return -EINVAL; -+ } -+ -+ if (e->next_offset < sizeof(struct compat_ipt_entry) + -+ sizeof(struct compat_ipt_entry_target)) { -+ duprintf("checking: element %p size %u\n", -+ e, e->next_offset); -+ return -EINVAL; -+ } -+ -+ if (!ip_checkentry(&e->ip)) { -+ duprintf("ip_tables: ip check failed %p %s.\n", e, name); -+ return -EINVAL; -+ } -+ -+ off = 0; -+ entry_offset = (void *)e - (void *)base; -+ j = 0; -+ ret = IPT_MATCH_ITERATE(e, compat_check_calc_match, name, &e->ip, -+ e->comefrom, &off, &j); -+ if (ret != 0) -+ goto out; -+ -+ t = ipt_get_target(e); -+ target = try_then_request_module(xt_find_target(AF_INET, -+ t->u.user.name, -+ t->u.user.revision), -+ "ipt_%s", t->u.user.name); -+ if (IS_ERR(target) || !target) { -+ duprintf("check_entry: `%s' not found\n", t->u.user.name); -+ ret = target ? 
PTR_ERR(target) : -ENOENT; -+ goto out; -+ } -+ t->u.kernel.target = target; -+ -+ if (t->u.kernel.target->compat) -+ t->u.kernel.target->compat(t, NULL, &off, COMPAT_CALC_SIZE); -+ *size += off; -+ ret = compat_add_offset(entry_offset, off); -+ if (ret) -+ goto out; -+ -+ /* Check hooks & underflows */ -+ for (h = 0; h < NF_IP_NUMHOOKS; h++) { -+ if ((unsigned char *)e - base == hook_entries[h]) -+ newinfo->hook_entry[h] = hook_entries[h]; -+ if ((unsigned char *)e - base == underflows[h]) -+ newinfo->underflow[h] = underflows[h]; -+ } -+ -+ /* Clear counters and comefrom */ -+ e->counters = ((struct ipt_counters) { 0, 0 }); -+ e->comefrom = 0; -+ -+ (*i)++; -+ return 0; -+out: -+ IPT_MATCH_ITERATE(e, cleanup_match, &j); -+ return ret; -+} -+ -+static inline int compat_copy_match_from_user(struct ipt_entry_match *m, -+ void **dstptr, compat_uint_t *size, const char *name, -+ const struct ipt_ip *ip, unsigned int hookmask) -+{ -+ struct ipt_entry_match *dm; -+ -+ dm = (struct ipt_entry_match *)*dstptr; -+ if (m->u.kernel.match->compat) -+ m->u.kernel.match->compat(m, dstptr, size, COMPAT_FROM_USER); -+ else { -+ memcpy(*dstptr, m, m->u.match_size); -+ *dstptr += m->u.match_size; -+ } -+ -+ if (dm->u.kernel.match->checkentry -+ && !dm->u.kernel.match->checkentry(name, ip, dm->data, -+ dm->u.match_size - sizeof(*dm), -+ hookmask)) { -+ module_put(dm->u.kernel.match->me); -+ duprintf("ip_tables: check failed for `%s'.\n", -+ dm->u.kernel.match->name); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, -+ unsigned int *size, const char *name, -+ struct xt_table_info *newinfo, unsigned char *base) -+{ -+ struct ipt_entry_target *t; -+ struct ipt_entry *de; -+ unsigned int origsize; -+ int ret, h; -+ -+ ret = 0; -+ origsize = *size; -+ de = (struct ipt_entry *)*dstptr; -+ memcpy(de, e, sizeof(struct ipt_entry)); -+ -+ *dstptr += sizeof(struct compat_ipt_entry); -+ ret = IPT_MATCH_ITERATE(e, 
compat_copy_match_from_user, dstptr, size, -+ name, &de->ip, de->comefrom); -+ if (ret) -+ goto out; -+ de->target_offset = e->target_offset - (origsize - *size); -+ t = ipt_get_target(e); -+ if (t->u.kernel.target->compat) -+ t->u.kernel.target->compat(t, -+ dstptr, size, COMPAT_FROM_USER); -+ else { -+ memcpy(*dstptr, t, t->u.target_size); -+ *dstptr += t->u.target_size; -+ } -+ -+ de->next_offset = e->next_offset - (origsize - *size); -+ for (h = 0; h < NF_IP_NUMHOOKS; h++) { -+ if ((unsigned char *)de - base < newinfo->hook_entry[h]) -+ newinfo->hook_entry[h] -= origsize - *size; -+ if ((unsigned char *)de - base < newinfo->underflow[h]) -+ newinfo->underflow[h] -= origsize - *size; -+ } -+ -+ ret = -EINVAL; -+ t = ipt_get_target(de); -+ if (t->u.kernel.target == &ipt_standard_target) { -+ if (!standard_check(t, *size)) -+ goto out; -+ } else if (t->u.kernel.target->checkentry -+ && !t->u.kernel.target->checkentry(name, de, t->data, -+ t->u.target_size -+ - sizeof(*t), -+ de->comefrom)) { -+ module_put(t->u.kernel.target->me); -+ duprintf("ip_tables: compat: check failed for `%s'.\n", -+ t->u.kernel.target->name); -+ goto out; -+ } -+ ret = 0; -+out: -+ return ret; -+} -+ -+static int -+translate_compat_table(const char *name, -+ unsigned int valid_hooks, -+ struct xt_table_info **pinfo, -+ void **pentry0, -+ unsigned int total_size, -+ unsigned int number, -+ unsigned int *hook_entries, -+ unsigned int *underflows) -+{ -+ unsigned int i; -+ struct xt_table_info *newinfo, *info; -+ void *pos, *entry0, *entry1; -+ unsigned int size; -+ int ret; -+ -+ info = *pinfo; -+ entry0 = *pentry0; -+ size = total_size; -+ info->number = number; -+ -+ /* Init all hooks to impossible value. */ -+ for (i = 0; i < NF_IP_NUMHOOKS; i++) { -+ info->hook_entry[i] = 0xFFFFFFFF; -+ info->underflow[i] = 0xFFFFFFFF; -+ } -+ -+ duprintf("translate_compat_table: size %u\n", info->size); -+ i = 0; -+ down(&compat_ipt_mutex); -+ /* Walk through entries, checking offsets. 
*/ -+ ret = IPT_ENTRY_ITERATE(entry0, total_size, -+ check_compat_entry_size_and_hooks, -+ info, &size, entry0, -+ entry0 + total_size, -+ hook_entries, underflows, &i, name); -+ if (ret != 0) -+ goto out_unlock; -+ -+ ret = -EINVAL; -+ if (i != number) { -+ duprintf("translate_compat_table: %u not %u entries\n", -+ i, number); -+ goto out_unlock; -+ } -+ -+ /* Check hooks all assigned */ -+ for (i = 0; i < NF_IP_NUMHOOKS; i++) { -+ /* Only hooks which are valid */ -+ if (!(valid_hooks & (1 << i))) -+ continue; -+ if (info->hook_entry[i] == 0xFFFFFFFF) { -+ duprintf("Invalid hook entry %u %u\n", -+ i, hook_entries[i]); -+ goto out_unlock; -+ } -+ if (info->underflow[i] == 0xFFFFFFFF) { -+ duprintf("Invalid underflow %u %u\n", -+ i, underflows[i]); -+ goto out_unlock; -+ } -+ } -+ -+ ret = -ENOMEM; -+ newinfo = xt_alloc_table_info(size); -+ if (!newinfo) -+ goto out_unlock; -+ -+ newinfo->number = number; -+ for (i = 0; i < NF_IP_NUMHOOKS; i++) { -+ newinfo->hook_entry[i] = info->hook_entry[i]; -+ newinfo->underflow[i] = info->underflow[i]; -+ } -+ entry1 = newinfo->entries[raw_smp_processor_id()]; -+ pos = entry1; -+ size = total_size; -+ ret = IPT_ENTRY_ITERATE(entry0, total_size, -+ compat_copy_entry_from_user, &pos, &size, -+ name, newinfo, entry1); -+ compat_flush_offsets(); -+ up(&compat_ipt_mutex); -+ if (ret) -+ goto free_newinfo; -+ -+ ret = -ELOOP; -+ if (!mark_source_chains(newinfo, valid_hooks, entry1)) -+ goto free_newinfo; -+ -+ /* And one copy for every other CPU */ -+ for_each_cpu(i) -+ if (newinfo->entries[i] && newinfo->entries[i] != entry1) -+ memcpy(newinfo->entries[i], entry1, newinfo->size); -+ -+ *pinfo = newinfo; -+ *pentry0 = entry1; -+ xt_free_table_info(info); -+ return 0; -+ -+free_newinfo: -+ xt_free_table_info(newinfo); -+out: -+ return ret; -+out_unlock: -+ up(&compat_ipt_mutex); -+ goto out; -+} -+ -+static int -+compat_do_replace(void __user *user, unsigned int len) -+{ -+ int ret; -+ struct compat_ipt_replace tmp; -+ struct 
xt_table_info *newinfo; -+ void *loc_cpu_entry; -+ -+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) -+ return -EFAULT; -+ -+ /* Hack: Causes ipchains to give correct error msg --RR */ -+ if (len != sizeof(tmp) + tmp.size) -+ return -ENOPROTOOPT; -+ -+ /* overflow check */ -+ if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - -+ SMP_CACHE_BYTES) -+ return -ENOMEM; -+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) -+ return -ENOMEM; -+ -+ newinfo = xt_alloc_table_info(tmp.size); -+ if (!newinfo) -+ return -ENOMEM; -+ -+ /* choose the copy that is our node/cpu */ -+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; -+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), -+ tmp.size) != 0) { -+ ret = -EFAULT; -+ goto free_newinfo; -+ } -+ -+ ret = translate_compat_table(tmp.name, tmp.valid_hooks, -+ &newinfo, &loc_cpu_entry, tmp.size, -+ tmp.num_entries, tmp.hook_entry, tmp.underflow); -+ if (ret != 0) -+ goto free_newinfo; -+ -+ duprintf("compat_do_replace: Translated table\n"); -+ -+ ret = __do_replace(tmp.name, tmp.valid_hooks, -+ newinfo, tmp.num_counters, -+ compat_ptr(tmp.counters)); -+ if (ret) -+ goto free_newinfo_untrans; -+ return 0; -+ -+ free_newinfo_untrans: -+ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); -+ free_newinfo: -+ xt_free_table_info(newinfo); -+ return ret; -+} -+ -+struct compat_ipt_get_entries -+{ -+ char name[IPT_TABLE_MAXNAMELEN]; -+ compat_uint_t size; -+ struct compat_ipt_entry entrytable[0]; -+}; -+ -+static int compat_copy_entries_to_user(unsigned int total_size, -+ struct ipt_table *table, void __user *userptr) -+{ -+ unsigned int off, num; -+ struct compat_ipt_entry e; -+ struct xt_counters *counters; -+ struct xt_table_info *private = table->private; -+ void __user *pos; -+ unsigned int size; -+ int ret = 0; -+ void *loc_cpu_entry; -+ -+ counters = alloc_counters(table); -+ if (IS_ERR(counters)) -+ return PTR_ERR(counters); -+ -+ /* choose the copy that is on our 
node/cpu, ... -+ * This choice is lazy (because current thread is -+ * allowed to migrate to another cpu) -+ */ -+ loc_cpu_entry = private->entries[raw_smp_processor_id()]; -+ pos = userptr; -+ size = total_size; -+ ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size, -+ compat_copy_entry_to_user, &pos, &size); -+ if (ret) -+ goto free_counters; -+ -+ /* ... then go back and fix counters and names */ -+ for (off = 0, num = 0; off < size; off += e.next_offset, num++) { -+ unsigned int i; -+ struct ipt_entry_match m; -+ struct ipt_entry_target t; -+ -+ ret = -EFAULT; -+ if (copy_from_user(&e, userptr + off, -+ sizeof(struct compat_ipt_entry))) -+ goto free_counters; -+ if (copy_to_user(userptr + off + -+ offsetof(struct compat_ipt_entry, counters), -+ &counters[num], sizeof(counters[num]))) -+ goto free_counters; -+ -+ for (i = sizeof(struct compat_ipt_entry); -+ i < e.target_offset; i += m.u.match_size) { -+ if (copy_from_user(&m, userptr + off + i, -+ sizeof(struct ipt_entry_match))) -+ goto free_counters; -+ if (copy_to_user(userptr + off + i + -+ offsetof(struct ipt_entry_match, u.user.name), -+ m.u.kernel.match->name, -+ strlen(m.u.kernel.match->name) + 1)) -+ goto free_counters; -+ } -+ -+ if (copy_from_user(&t, userptr + off + e.target_offset, -+ sizeof(struct ipt_entry_target))) -+ goto free_counters; -+ if (copy_to_user(userptr + off + e.target_offset + -+ offsetof(struct ipt_entry_target, u.user.name), -+ t.u.kernel.target->name, -+ strlen(t.u.kernel.target->name) + 1)) -+ goto free_counters; -+ } -+ ret = 0; -+free_counters: -+ vfree(counters); -+ return ret; -+} -+ -+static int -+compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len) -+{ -+ int ret; -+ struct compat_ipt_get_entries get; -+ struct ipt_table *t; -+ -+ -+ if (*len < sizeof(get)) { -+ duprintf("compat_get_entries: %u < %u\n", -+ *len, (unsigned int)sizeof(get)); -+ return -EINVAL; -+ } -+ -+ if (copy_from_user(&get, uptr, sizeof(get)) != 0) -+ return -EFAULT; -+ -+ if (*len != 
sizeof(struct compat_ipt_get_entries) + get.size) { -+ duprintf("compat_get_entries: %u != %u\n", *len, -+ (unsigned int)(sizeof(struct compat_ipt_get_entries) + -+ get.size)); -+ return -EINVAL; -+ } -+ -+ down(&compat_ipt_mutex); -+ t = xt_find_table_lock(AF_INET, get.name); -+ if (t && !IS_ERR(t)) { -+ struct xt_table_info *private = t->private; -+ struct xt_table_info info; -+ duprintf("t->private->number = %u\n", -+ private->number); -+ ret = compat_table_info(private, &info); -+ if (!ret && get.size == info.size) { -+ ret = compat_copy_entries_to_user(private->size, -+ t, uptr->entrytable); -+ } else if (!ret) { -+ duprintf("compat_get_entries: I've got %u not %u!\n", -+ private->size, -+ get.size); -+ ret = -EINVAL; -+ } -+ compat_flush_offsets(); -+ module_put(t->me); -+ xt_table_unlock(t); -+ } else -+ ret = t ? PTR_ERR(t) : -ENOENT; -+ -+ up(&compat_ipt_mutex); -+ return ret; -+} -+ -+static int -+compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -+{ -+ int ret; -+ -+ switch (cmd) { -+ case IPT_SO_GET_INFO: -+ ret = get_info(user, len); -+ break; -+ case IPT_SO_GET_ENTRIES: -+ ret = compat_get_entries(user, len); -+ break; -+ default: -+ duprintf("compat_do_ipt_get_ctl: unknown request %i\n", cmd); -+ ret = -EINVAL; -+ } -+ return ret; -+} -+#endif -+ - static int - do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) - { - int ret; - -- if (!capable(CAP_NET_ADMIN)) -+ if (!capable(CAP_VE_NET_ADMIN)) - return -EPERM; - -+#ifdef CONFIG_COMPAT -+ if (is_current_32bits() && (cmd == IPT_SO_SET_REPLACE)) -+ return compat_do_replace(user, len); -+#endif -+ - switch (cmd) { - case IPT_SO_SET_REPLACE: - ret = do_replace(user, len); -@@ -1116,69 +2131,22 @@ do_ipt_get_ctl(struct sock *sk, int cmd, - { - int ret; - -- if (!capable(CAP_NET_ADMIN)) -+ if (!capable(CAP_VE_NET_ADMIN)) - return -EPERM; - -- switch (cmd) { -- case IPT_SO_GET_INFO: { -- char name[IPT_TABLE_MAXNAMELEN]; -- struct ipt_table *t; -- -- if 
(*len != sizeof(struct ipt_getinfo)) { -- duprintf("length %u != %u\n", *len, -- sizeof(struct ipt_getinfo)); -- ret = -EINVAL; -- break; -- } -- -- if (copy_from_user(name, user, sizeof(name)) != 0) { -- ret = -EFAULT; -- break; -- } -- name[IPT_TABLE_MAXNAMELEN-1] = '\0'; -- -- t = try_then_request_module(xt_find_table_lock(AF_INET, name), -- "iptable_%s", name); -- if (t && !IS_ERR(t)) { -- struct ipt_getinfo info; -- struct xt_table_info *private = t->private; -- -- info.valid_hooks = t->valid_hooks; -- memcpy(info.hook_entry, private->hook_entry, -- sizeof(info.hook_entry)); -- memcpy(info.underflow, private->underflow, -- sizeof(info.underflow)); -- info.num_entries = private->number; -- info.size = private->size; -- memcpy(info.name, name, sizeof(info.name)); -- -- if (copy_to_user(user, &info, *len) != 0) -- ret = -EFAULT; -- else -- ret = 0; -- xt_table_unlock(t); -- module_put(t->me); -- } else -- ret = t ? PTR_ERR(t) : -ENOENT; -- } -- break; -+#ifdef CONFIG_COMPAT -+ if (is_current_32bits()) -+ return compat_do_ipt_get_ctl(sk, cmd, user, len); -+#endif - -- case IPT_SO_GET_ENTRIES: { -- struct ipt_get_entries get; -+ switch (cmd) { -+ case IPT_SO_GET_INFO: -+ ret = get_info(user, len); -+ break; - -- if (*len < sizeof(get)) { -- duprintf("get_entries: %u < %u\n", *len, sizeof(get)); -- ret = -EINVAL; -- } else if (copy_from_user(&get, user, sizeof(get)) != 0) { -- ret = -EFAULT; -- } else if (*len != sizeof(struct ipt_get_entries) + get.size) { -- duprintf("get_entries: %u != %u\n", *len, -- sizeof(struct ipt_get_entries) + get.size); -- ret = -EINVAL; -- } else -- ret = get_entries(&get, user); -+ case IPT_SO_GET_ENTRIES: -+ ret = get_entries(user, len); - break; -- } - - case IPT_SO_GET_REVISION_MATCH: - case IPT_SO_GET_REVISION_TARGET: { -@@ -1214,7 +2182,8 @@ do_ipt_get_ctl(struct sock *sk, int cmd, - return ret; - } - --int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) -+struct xt_table *ipt_register_table(struct 
xt_table *table, -+ const struct ipt_replace *repl) - { - int ret; - struct xt_table_info *newinfo; -@@ -1224,7 +2193,7 @@ int ipt_register_table(struct xt_table * - - newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) -- return -ENOMEM; -+ return ERR_PTR(-ENOMEM); - - /* choose the copy on our node/cpu - * but dont care of preemption -@@ -1239,15 +2208,14 @@ int ipt_register_table(struct xt_table * - repl->underflow); - if (ret != 0) { - xt_free_table_info(newinfo); -- return ret; -+ return ERR_PTR(ret); - } - -- if (xt_register_table(table, &bootstrap, newinfo) != 0) { -+ table = virt_xt_register_table(table, &bootstrap, newinfo); -+ if (IS_ERR(table)) - xt_free_table_info(newinfo); -- return ret; -- } - -- return 0; -+ return table; - } - - void ipt_unregister_table(struct ipt_table *table) -@@ -1255,7 +2223,7 @@ void ipt_unregister_table(struct ipt_tab - struct xt_table_info *private; - void *loc_cpu_entry; - -- private = xt_unregister_table(table); -+ private = virt_xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; -@@ -1263,6 +2231,29 @@ void ipt_unregister_table(struct ipt_tab - xt_free_table_info(private); - } - -+void ipt_flush_table(struct xt_table *table) -+{ -+ struct xt_table *t; -+ void *loc_cpu_entry; -+ -+ if (table == NULL) -+ return; -+ -+ t = xt_find_table_lock(AF_INET, table->name); -+ if (t && !IS_ERR(t)) { -+ struct xt_table_info *private; -+ private = t->private; -+ loc_cpu_entry = private->entries[raw_smp_processor_id()]; -+ IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, -+ cleanup_entry, NULL); -+ if (private->number > private->initial_entries) -+ module_put(t->me); -+ private->size = 0; -+ xt_table_unlock(t); -+ module_put(t->me); -+ } -+} -+ - /* Returns 1 if the type and code is matched by the range, 0 otherwise */ - static inline int - icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, -@@ -1327,6 +2318,9 
@@ icmp_checkentry(const char *tablename, - /* The built-in targets: standard (NULL) and error. */ - static struct ipt_target ipt_standard_target = { - .name = IPT_STANDARD_TARGET, -+#ifdef CONFIG_COMPAT -+ .compat = &compat_ipt_standard_fn, -+#endif - }; - - static struct ipt_target ipt_error_target = { -@@ -1348,43 +2342,101 @@ static struct ipt_match icmp_matchstruct - .name = "icmp", - .match = &icmp_match, - .checkentry = &icmp_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &icmp_compat, -+#endif - }; - --static int __init init(void) -+static int init_iptables(void) - { - int ret; - -- xt_proto_init(AF_INET); -+ if (ve_ipt_standard_target != NULL) -+ return -EEXIST; -+ -+ ret = xt_register_target(AF_INET, &ipt_standard_target); -+ if (ret) -+ goto out; -+ ve_ipt_standard_target = xt_find_target(AF_INET, IPT_STANDARD_TARGET, 0); -+ if (IS_ERR(ve_ipt_standard_target)) -+ goto out_standard; -+ ret = xt_register_target(AF_INET, &ipt_error_target); -+ if (ret) -+ goto out_error; -+ ret = xt_register_match(AF_INET, &icmp_matchstruct); -+ if (ret) -+ goto out_icmp; -+ ret = xt_proto_init(AF_INET); -+ if (ret) -+ goto out_proc; -+ return 0; -+ -+out_proc: -+ xt_unregister_match(AF_INET, &icmp_matchstruct); -+out_icmp: -+ xt_unregister_target(AF_INET, &ipt_error_target); -+out_error: -+ ve_ipt_standard_target = NULL; -+out_standard: -+ xt_unregister_target(AF_INET, &ipt_standard_target); -+out: -+ return ret; -+} -+ -+static void fini_iptables(void) -+{ -+ xt_proto_fini(AF_INET); -+ xt_unregister_match(AF_INET, &icmp_matchstruct); -+ xt_unregister_target(AF_INET, &ipt_error_target); -+ ve_ipt_standard_target = NULL; -+ xt_unregister_target(AF_INET, &ipt_standard_target); -+} - -- /* Noone else will be downing sem now, so we won't sleep */ -- xt_register_target(AF_INET, &ipt_standard_target); -- xt_register_target(AF_INET, &ipt_error_target); -- xt_register_match(AF_INET, &icmp_matchstruct); -+static int __init init(void) -+{ -+ int ret; -+ -+ ret = init_iptables(); -+ 
if (ret) -+ goto out; - - /* Register setsockopt */ - ret = nf_register_sockopt(&ipt_sockopts); - if (ret < 0) { - duprintf("Unable to register sockopts.\n"); -- return ret; -+ goto out_sockopts; - } - -+ KSYMRESOLVE(init_iptables); -+ KSYMRESOLVE(fini_iptables); -+ KSYMRESOLVE(ipt_flush_table); -+ KSYMMODRESOLVE(ip_tables); - printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n"); - return 0; -+ -+out_sockopts: -+ fini_iptables(); -+out: -+ return ret; - } - - static void __exit fini(void) - { -+ KSYMMODUNRESOLVE(ip_tables); -+ KSYMUNRESOLVE(init_iptables); -+ KSYMUNRESOLVE(fini_iptables); -+ KSYMUNRESOLVE(ipt_flush_table); - nf_unregister_sockopt(&ipt_sockopts); -- -- xt_unregister_match(AF_INET, &icmp_matchstruct); -- xt_unregister_target(AF_INET, &ipt_error_target); -- xt_unregister_target(AF_INET, &ipt_standard_target); -- -- xt_proto_fini(AF_INET); -+ fini_iptables(); - } - - EXPORT_SYMBOL(ipt_register_table); - EXPORT_SYMBOL(ipt_unregister_table); - EXPORT_SYMBOL(ipt_do_table); --module_init(init); -+#ifdef CONFIG_COMPAT -+EXPORT_SYMBOL(ipt_match_align_compat); -+EXPORT_SYMBOL(ipt_target_align_compat); -+#endif -+EXPORT_SYMBOL(ipt_flush_table); -+subsys_initcall(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_LOG.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_LOG.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_LOG.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_LOG.c 2006-04-19 15:02:12.000000000 +0400 -@@ -18,6 +18,7 @@ - #include <net/udp.h> - #include <net/tcp.h> - #include <net/route.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter.h> - #include <linux/netfilter_ipv4/ip_tables.h> -@@ -463,10 +464,25 @@ static int ipt_log_checkentry(const char - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int ipt_log_compat(void *target, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = IPT_ALIGN(sizeof(struct ipt_log_info)) - -+ 
COMPAT_IPT_ALIGN(sizeof(struct ipt_log_info)); -+ return ipt_target_align_compat(target, dstptr, size, off, convert); -+} -+#endif -+ - static struct ipt_target ipt_log_reg = { - .name = "LOG", - .target = ipt_log_target, - .checkentry = ipt_log_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = ipt_log_compat, -+#endif - .me = THIS_MODULE, - }; - -@@ -476,24 +492,44 @@ static struct nf_logger ipt_log_logger = - .me = THIS_MODULE, - }; - -+int init_iptable_LOG(void) -+{ -+ return ipt_register_target(&ipt_log_reg); -+} -+ -+void fini_iptable_LOG(void) -+{ -+ ipt_unregister_target(&ipt_log_reg); -+} -+ - static int __init init(void) - { -- if (ipt_register_target(&ipt_log_reg)) -- return -EINVAL; -+ int err; -+ -+ err = init_iptable_LOG(); -+ if (err < 0) -+ return err; - if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { -- printk(KERN_WARNING "ipt_LOG: not logging via system console " -+ ve_printk(VE_LOG, KERN_WARNING "ipt_LOG: not logging via system console " - "since somebody else already registered for PF_INET\n"); - /* we cannot make module load fail here, since otherwise - * iptables userspace would abort */ - } - -+ -+ KSYMRESOLVE(init_iptable_LOG); -+ KSYMRESOLVE(fini_iptable_LOG); -+ KSYMMODRESOLVE(ipt_LOG); - return 0; - } - - static void __exit fini(void) - { -+ KSYMMODUNRESOLVE(ipt_LOG); -+ KSYMUNRESOLVE(init_iptable_LOG); -+ KSYMUNRESOLVE(fini_iptable_LOG); - nf_log_unregister_logger(&ipt_log_logger); -- ipt_unregister_target(&ipt_log_reg); -+ fini_iptable_LOG(); - } - - module_init(init); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_MASQUERADE.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-04-19 15:02:12.000000000 +0400 -@@ -120,6 +120,7 @@ masquerade_target(struct sk_buff **pskb, - return ip_nat_setup_info(ct, &newrange, hooknum); - } - -+#if 0 - static inline int - 
device_cmp(struct ip_conntrack *i, void *ifindex) - { -@@ -175,6 +176,7 @@ static struct notifier_block masq_dev_no - static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, - }; -+#endif - - static struct ipt_target masquerade = { - .name = "MASQUERADE", -@@ -189,12 +191,16 @@ static int __init init(void) - - ret = ipt_register_target(&masquerade); - -+#if 0 -+/* These notifiers are unnecessary and may -+ lead to oops in virtual environments */ - if (ret == 0) { - /* Register for device down reports */ - register_netdevice_notifier(&masq_dev_notifier); - /* Register IP address change reports */ - register_inetaddr_notifier(&masq_inet_notifier); - } -+#endif - - return ret; - } -@@ -202,8 +208,8 @@ static int __init init(void) - static void __exit fini(void) - { - ipt_unregister_target(&masquerade); -- unregister_netdevice_notifier(&masq_dev_notifier); -- unregister_inetaddr_notifier(&masq_inet_notifier); -+/* unregister_netdevice_notifier(&masq_dev_notifier); -+ unregister_inetaddr_notifier(&masq_inet_notifier); */ - } - - module_init(init); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_REDIRECT.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_REDIRECT.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_REDIRECT.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_REDIRECT.c 2006-04-19 15:02:12.000000000 +0400 -@@ -17,6 +17,7 @@ - #include <linux/inetdevice.h> - #include <net/protocol.h> - #include <net/checksum.h> -+#include <linux/nfcalls.h> - #include <linux/netfilter_ipv4.h> - #include <linux/netfilter_ipv4/ip_nat_rule.h> - -@@ -25,7 +26,7 @@ MODULE_AUTHOR("Netfilter Core Team <core - MODULE_DESCRIPTION("iptables REDIRECT target module"); - - #if 0 --#define DEBUGP printk -+#define DEBUGP ve_printk - #else - #define DEBUGP(format, args...) 
- #endif -@@ -119,15 +120,37 @@ static struct ipt_target redirect_reg = - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_iptable_REDIRECT(void) - { - return ipt_register_target(&redirect_reg); - } - --static void __exit fini(void) -+void fini_iptable_REDIRECT(void) - { - ipt_unregister_target(&redirect_reg); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_REDIRECT(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_REDIRECT); -+ KSYMRESOLVE(fini_iptable_REDIRECT); -+ KSYMMODRESOLVE(ipt_REDIRECT); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ipt_REDIRECT); -+ KSYMUNRESOLVE(init_iptable_REDIRECT); -+ KSYMUNRESOLVE(fini_iptable_REDIRECT); -+ fini_iptable_REDIRECT(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_REJECT.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_REJECT.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_REJECT.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_REJECT.c 2006-04-19 15:02:12.000000000 +0400 -@@ -22,6 +22,7 @@ - #include <net/ip.h> - #include <net/tcp.h> - #include <net/route.h> -+#include <linux/nfcalls.h> - #include <net/dst.h> - #include <linux/netfilter_ipv4/ip_tables.h> - #include <linux/netfilter_ipv4/ipt_REJECT.h> -@@ -322,22 +323,59 @@ static int check(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat(void *target, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = IPT_ALIGN(sizeof(struct ipt_reject_info)) - -+ COMPAT_IPT_ALIGN(sizeof(struct ipt_reject_info)); -+ return ipt_target_align_compat(target, dstptr, size, off, convert); -+} -+#endif -+ - static struct ipt_target ipt_reject_reg = { - .name = "REJECT", - .target = reject, - .checkentry = check, -+#ifdef CONFIG_COMPAT -+ .compat = compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_iptable_REJECT(void) 
- { - return ipt_register_target(&ipt_reject_reg); - } - --static void __exit fini(void) -+void fini_iptable_REJECT(void) - { - ipt_unregister_target(&ipt_reject_reg); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_REJECT(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_REJECT); -+ KSYMRESOLVE(fini_iptable_REJECT); -+ KSYMMODRESOLVE(ipt_REJECT); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ipt_REJECT); -+ KSYMUNRESOLVE(init_iptable_REJECT); -+ KSYMUNRESOLVE(fini_iptable_REJECT); -+ fini_iptable_REJECT(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_TCPMSS.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_TCPMSS.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_TCPMSS.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_TCPMSS.c 2006-04-19 15:02:12.000000000 +0400 -@@ -13,6 +13,7 @@ - - #include <linux/ip.h> - #include <net/tcp.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter_ipv4/ip_tables.h> - #include <linux/netfilter_ipv4/ipt_TCPMSS.h> -@@ -242,22 +243,59 @@ ipt_tcpmss_checkentry(const char *tablen - return 0; - } - -+#ifdef CONFIG_COMPAT -+static int ipt_tcpmss_compat(void *target, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = IPT_ALIGN(sizeof(struct ipt_tcpmss_info)) - -+ COMPAT_IPT_ALIGN(sizeof(struct ipt_tcpmss_info)); -+ return ipt_target_align_compat(target, dstptr, size, off, convert); -+} -+#endif -+ - static struct ipt_target ipt_tcpmss_reg = { - .name = "TCPMSS", - .target = ipt_tcpmss_target, - .checkentry = ipt_tcpmss_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = ipt_tcpmss_compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_iptable_TCPMSS(void) - { - return ipt_register_target(&ipt_tcpmss_reg); - } - --static void __exit fini(void) -+void fini_iptable_TCPMSS(void) - { - ipt_unregister_target(&ipt_tcpmss_reg); - 
} - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_TCPMSS(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_TCPMSS); -+ KSYMRESOLVE(fini_iptable_TCPMSS); -+ KSYMMODRESOLVE(ipt_TCPMSS); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ipt_TCPMSS); -+ KSYMUNRESOLVE(init_iptable_TCPMSS); -+ KSYMUNRESOLVE(fini_iptable_TCPMSS); -+ fini_iptable_TCPMSS(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_TOS.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_TOS.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_TOS.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_TOS.c 2006-04-19 15:02:12.000000000 +0400 -@@ -15,6 +15,7 @@ - - #include <linux/netfilter_ipv4/ip_tables.h> - #include <linux/netfilter_ipv4/ipt_TOS.h> -+#include <linux/nfcalls.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -@@ -83,22 +84,59 @@ checkentry(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat(void *target, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = IPT_ALIGN(sizeof(struct ipt_tos_target_info)) - -+ COMPAT_IPT_ALIGN(sizeof(struct ipt_tos_target_info)); -+ return ipt_target_align_compat(target, dstptr, size, off, convert); -+} -+#endif -+ - static struct ipt_target ipt_tos_reg = { - .name = "TOS", - .target = target, - .checkentry = checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_iptable_TOS(void) - { - return ipt_register_target(&ipt_tos_reg); - } - --static void __exit fini(void) -+void fini_iptable_TOS(void) - { - ipt_unregister_target(&ipt_tos_reg); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_TOS(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_TOS); -+ KSYMRESOLVE(fini_iptable_TOS); -+ 
KSYMMODRESOLVE(ipt_TOS); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ipt_TOS); -+ KSYMUNRESOLVE(init_iptable_TOS); -+ KSYMUNRESOLVE(fini_iptable_TOS); -+ fini_iptable_TOS(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_multiport.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_multiport.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_multiport.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_multiport.c 2006-04-19 15:02:12.000000000 +0400 -@@ -13,6 +13,7 @@ - #include <linux/types.h> - #include <linux/udp.h> - #include <linux/skbuff.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter_ipv4/ipt_multiport.h> - #include <linux/netfilter_ipv4/ip_tables.h> -@@ -21,6 +22,13 @@ MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); - MODULE_DESCRIPTION("iptables multiple port match module"); - -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_multiport_match (*(get_exec_env()->_multiport_match)) -+#else -+#define ve_multiport_match multiport_match -+#endif -+ - #if 0 - #define duprintf(format, args...) 
printk(format , ## args) - #else -@@ -174,11 +182,36 @@ checkentry_v1(const char *tablename, - return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1))); - } - -+#ifdef CONFIG_COMPAT -+static int compat(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = IPT_ALIGN(sizeof(struct ipt_multiport)) - -+ COMPAT_IPT_ALIGN(sizeof(struct ipt_multiport)); -+ return ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+ -+static int compat_v1(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = IPT_ALIGN(sizeof(struct ipt_multiport_v1)) - -+ COMPAT_IPT_ALIGN(sizeof(struct ipt_multiport_v1)); -+ return ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+#endif -+ - static struct ipt_match multiport_match = { - .name = "multiport", - .revision = 0, - .match = &match, - .checkentry = &checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - -@@ -187,10 +220,13 @@ static struct ipt_match multiport_match_ - .revision = 1, - .match = &match_v1, - .checkentry = &checkentry_v1, -+#ifdef CONFIG_COMPAT -+ .compat = &compat_v1, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_iptable_multiport(void) - { - int err; - -@@ -204,11 +240,33 @@ static int __init init(void) - return err; - } - --static void __exit fini(void) -+void fini_iptable_multiport(void) - { - ipt_unregister_match(&multiport_match); - ipt_unregister_match(&multiport_match_v1); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_multiport(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_multiport); -+ KSYMRESOLVE(fini_iptable_multiport); -+ KSYMMODRESOLVE(ipt_multiport); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ipt_multiport); -+ KSYMUNRESOLVE(init_iptable_multiport); -+ KSYMUNRESOLVE(fini_iptable_multiport); -+ fini_iptable_multiport(); -+} -+ - module_init(init); - module_exit(fini); 
-diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_tos.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_tos.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_tos.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_tos.c 2006-04-19 15:02:12.000000000 +0400 -@@ -10,6 +10,7 @@ - - #include <linux/module.h> - #include <linux/skbuff.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter_ipv4/ipt_tos.h> - #include <linux/netfilter_ipv4/ip_tables.h> -@@ -17,6 +18,13 @@ - MODULE_LICENSE("GPL"); - MODULE_DESCRIPTION("iptables TOS match module"); - -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_tos_match (*(get_exec_env()->_tos_match)) -+#else -+#define ve_tos_match tos_match -+#endif -+ - static int - match(const struct sk_buff *skb, - const struct net_device *in, -@@ -44,22 +52,59 @@ checkentry(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = IPT_ALIGN(sizeof(struct ipt_tos_info)) - -+ COMPAT_IPT_ALIGN(sizeof(struct ipt_tos_info)); -+ return ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+#endif -+ - static struct ipt_match tos_match = { - .name = "tos", - .match = &match, - .checkentry = &checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_iptable_tos(void) - { - return ipt_register_match(&tos_match); - } - --static void __exit fini(void) -+void fini_iptable_tos(void) - { - ipt_unregister_match(&tos_match); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_tos(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_tos); -+ KSYMRESOLVE(fini_iptable_tos); -+ KSYMMODRESOLVE(ipt_tos); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ipt_tos); -+ KSYMUNRESOLVE(init_iptable_tos); -+ KSYMUNRESOLVE(fini_iptable_tos); -+ 
fini_iptable_tos(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_ttl.c linux-2.6.16-026test009/net/ipv4/netfilter/ipt_ttl.c ---- linux-2.6.16.orig/net/ipv4/netfilter/ipt_ttl.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/ipt_ttl.c 2006-04-19 15:02:12.000000000 +0400 -@@ -11,6 +11,7 @@ - - #include <linux/module.h> - #include <linux/skbuff.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter_ipv4/ipt_ttl.h> - #include <linux/netfilter_ipv4/ip_tables.h> -@@ -57,22 +58,58 @@ static int checkentry(const char *tablen - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = IPT_ALIGN(sizeof(struct ipt_ttl_info)) - -+ COMPAT_IPT_ALIGN(sizeof(struct ipt_ttl_info)); -+ return ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+#endif -+ - static struct ipt_match ttl_match = { - .name = "ttl", - .match = &match, - .checkentry = &checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_iptable_ttl(void) - { - return ipt_register_match(&ttl_match); - } - --static void __exit fini(void) -+void fini_iptable_ttl(void) - { - ipt_unregister_match(&ttl_match); -+} - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_ttl(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_ttl); -+ KSYMRESOLVE(fini_iptable_ttl); -+ KSYMMODRESOLVE(ipt_ttl); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(ipt_ttl); -+ KSYMUNRESOLVE(init_iptable_ttl); -+ KSYMUNRESOLVE(fini_iptable_ttl); -+ fini_iptable_ttl(); - } - - module_init(init); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_filter.c linux-2.6.16-026test009/net/ipv4/netfilter/iptable_filter.c ---- linux-2.6.16.orig/net/ipv4/netfilter/iptable_filter.c 2006-04-19 15:02:02.000000000 +0400 -+++ 
linux-2.6.16-026test009/net/ipv4/netfilter/iptable_filter.c 2006-04-19 15:02:12.000000000 +0400 -@@ -12,12 +12,20 @@ - - #include <linux/module.h> - #include <linux/moduleparam.h> -+#include <linux/nfcalls.h> - #include <linux/netfilter_ipv4/ip_tables.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); - MODULE_DESCRIPTION("iptables filter table"); - -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_packet_filter (get_exec_env()->_ve_ipt_filter_pf) -+#else -+#define ve_packet_filter &packet_filter -+#endif -+ - #define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) - - static struct -@@ -25,7 +33,7 @@ static struct - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; --} initial_table __initdata -+} initial_table - = { { "filter", FILTER_VALID_HOOKS, 4, - sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - { [NF_IP_LOCAL_IN] = 0, -@@ -90,7 +98,7 @@ ipt_hook(unsigned int hook, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) - { -- return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); -+ return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL); - } - - static unsigned int -@@ -108,7 +116,7 @@ ipt_local_out_hook(unsigned int hook, - return NF_ACCEPT; - } - -- return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); -+ return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL); - } - - static struct nf_hook_ops ipt_ops[] = { -@@ -139,56 +147,89 @@ static struct nf_hook_ops ipt_ops[] = { - static int forward = NF_ACCEPT; - module_param(forward, bool, 0000); - --static int __init init(void) -+int init_iptable_filter(void) - { - int ret; -- -- if (forward < 0 || forward > NF_MAX_VERDICT) { -- printk("iptables forward must be 0 or 1\n"); -- return -EINVAL; -- } -- -- /* Entry 1 is the FORWARD hook */ -- initial_table.entries[1].target.verdict = -forward - 1; -+ struct ipt_table 
*tmp_filter; - - /* Register table */ -- ret = ipt_register_table(&packet_filter, &initial_table.repl); -- if (ret < 0) -- return ret; -+ tmp_filter = ipt_register_table(&packet_filter, -+ &initial_table.repl); -+ if (IS_ERR(tmp_filter)) -+ return PTR_ERR(tmp_filter); -+#ifdef CONFIG_VE_IPTABLES -+ ve_packet_filter = tmp_filter; -+#endif - - /* Register hooks */ -- ret = nf_register_hook(&ipt_ops[0]); -+ ret = virt_nf_register_hook(&ipt_ops[0]); - if (ret < 0) - goto cleanup_table; - -- ret = nf_register_hook(&ipt_ops[1]); -+ ret = virt_nf_register_hook(&ipt_ops[1]); - if (ret < 0) - goto cleanup_hook0; - -- ret = nf_register_hook(&ipt_ops[2]); -+ ret = virt_nf_register_hook(&ipt_ops[2]); - if (ret < 0) - goto cleanup_hook1; - - return ret; - - cleanup_hook1: -- nf_unregister_hook(&ipt_ops[1]); -+ virt_nf_unregister_hook(&ipt_ops[1]); - cleanup_hook0: -- nf_unregister_hook(&ipt_ops[0]); -+ virt_nf_unregister_hook(&ipt_ops[0]); - cleanup_table: -- ipt_unregister_table(&packet_filter); -+ ipt_unregister_table(ve_packet_filter); -+#ifdef CONFIG_VE_IPTABLES -+ ve_packet_filter = NULL; -+#endif - - return ret; - } - --static void __exit fini(void) -+void fini_iptable_filter(void) - { - unsigned int i; - - for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) -- nf_unregister_hook(&ipt_ops[i]); -+ virt_nf_unregister_hook(&ipt_ops[i]); - -- ipt_unregister_table(&packet_filter); -+ ipt_unregister_table(ve_packet_filter); -+#ifdef CONFIG_VE_IPTABLES -+ ve_packet_filter = NULL; -+#endif -+} -+ -+static int __init init(void) -+{ -+ int err; -+ -+ if (forward < 0 || forward > NF_MAX_VERDICT) { -+ printk("iptables forward must be 0 or 1\n"); -+ return -EINVAL; -+ } -+ -+ /* Entry 1 is the FORWARD hook */ -+ initial_table.entries[1].target.verdict = -forward - 1; -+ -+ err = init_iptable_filter(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_filter); -+ KSYMRESOLVE(fini_iptable_filter); -+ KSYMMODRESOLVE(iptable_filter); -+ return 0; -+} -+ -+static 
void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(iptable_filter); -+ KSYMUNRESOLVE(init_iptable_filter); -+ KSYMUNRESOLVE(fini_iptable_filter); -+ fini_iptable_filter(); - } - - module_init(init); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_mangle.c linux-2.6.16-026test009/net/ipv4/netfilter/iptable_mangle.c ---- linux-2.6.16.orig/net/ipv4/netfilter/iptable_mangle.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/iptable_mangle.c 2006-04-19 15:02:12.000000000 +0400 -@@ -17,6 +17,7 @@ - #include <linux/skbuff.h> - #include <net/sock.h> - #include <net/route.h> -+#include <linux/nfcalls.h> - #include <linux/ip.h> - - MODULE_LICENSE("GPL"); -@@ -35,7 +36,7 @@ static struct - struct ipt_replace repl; - struct ipt_standard entries[5]; - struct ipt_error term; --} initial_table __initdata -+} initial_table - = { { "mangle", MANGLE_VALID_HOOKS, 6, - sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), - { [NF_IP_PRE_ROUTING] = 0, -@@ -112,6 +113,13 @@ static struct ipt_table packet_mangler = - .af = AF_INET, - }; - -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_packet_mangler (get_exec_env()->_ipt_mangle_table) -+#else -+#define ve_packet_mangler &packet_mangler -+#endif -+ - /* The work comes in here from netfilter.c. */ - static unsigned int - ipt_route_hook(unsigned int hook, -@@ -120,7 +128,7 @@ ipt_route_hook(unsigned int hook, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) - { -- return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); -+ return ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); - } - - static unsigned int -@@ -149,7 +157,8 @@ ipt_local_hook(unsigned int hook, - daddr = (*pskb)->nh.iph->daddr; - tos = (*pskb)->nh.iph->tos; - -- ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); -+ ret = ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); -+ - /* Reroute for ANY change. 
*/ - if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE - && ((*pskb)->nh.iph->saddr != saddr -@@ -201,60 +210,103 @@ static struct nf_hook_ops ipt_ops[] = { - }, - }; - --static int __init init(void) -+static int mangle_init(struct nf_hook_ops ipt_ops[]) - { - int ret; -+ struct ipt_table *tmp_mangler; - - /* Register table */ -- ret = ipt_register_table(&packet_mangler, &initial_table.repl); -- if (ret < 0) -- return ret; -+ tmp_mangler = ipt_register_table(&packet_mangler, -+ &initial_table.repl); -+ if (IS_ERR(tmp_mangler)) -+ return PTR_ERR(tmp_mangler); -+#ifdef CONFIG_VE_IPTABLES -+ ve_packet_mangler = tmp_mangler; -+#endif - - /* Register hooks */ -- ret = nf_register_hook(&ipt_ops[0]); -+ ret = virt_nf_register_hook(&ipt_ops[0]); - if (ret < 0) - goto cleanup_table; - -- ret = nf_register_hook(&ipt_ops[1]); -+ ret = virt_nf_register_hook(&ipt_ops[1]); - if (ret < 0) - goto cleanup_hook0; - -- ret = nf_register_hook(&ipt_ops[2]); -+ ret = virt_nf_register_hook(&ipt_ops[2]); - if (ret < 0) - goto cleanup_hook1; - -- ret = nf_register_hook(&ipt_ops[3]); -+ ret = virt_nf_register_hook(&ipt_ops[3]); - if (ret < 0) - goto cleanup_hook2; - -- ret = nf_register_hook(&ipt_ops[4]); -+ ret = virt_nf_register_hook(&ipt_ops[4]); - if (ret < 0) - goto cleanup_hook3; - - return ret; - - cleanup_hook3: -- nf_unregister_hook(&ipt_ops[3]); -+ virt_nf_unregister_hook(&ipt_ops[3]); - cleanup_hook2: -- nf_unregister_hook(&ipt_ops[2]); -+ virt_nf_unregister_hook(&ipt_ops[2]); - cleanup_hook1: -- nf_unregister_hook(&ipt_ops[1]); -+ virt_nf_unregister_hook(&ipt_ops[1]); - cleanup_hook0: -- nf_unregister_hook(&ipt_ops[0]); -+ virt_nf_unregister_hook(&ipt_ops[0]); - cleanup_table: -- ipt_unregister_table(&packet_mangler); -+ ipt_unregister_table(ve_packet_mangler); -+#ifdef CONFIG_VE_IPTABLES -+ ve_packet_mangler = NULL; -+#endif - - return ret; - } - --static void __exit fini(void) -+static void mangle_fini(struct nf_hook_ops ipt_ops[]) - { - unsigned int i; - -- for (i = 
0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) -- nf_unregister_hook(&ipt_ops[i]); -+ for (i = 0; i < 5; i++) -+ virt_nf_unregister_hook(&ipt_ops[i]); -+ -+ ipt_unregister_table(ve_packet_mangler); -+#ifdef CONFIG_VE_IPTABLES -+ ve_packet_mangler = NULL; -+#endif -+} -+ -+int init_iptable_mangle(void) -+{ -+ return mangle_init(ipt_ops); -+} -+ -+void fini_iptable_mangle(void) -+{ -+ mangle_fini(ipt_ops); -+} -+ -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_iptable_mangle(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_iptable_mangle); -+ KSYMRESOLVE(fini_iptable_mangle); -+ KSYMMODRESOLVE(iptable_mangle); -+ return 0; -+} - -- ipt_unregister_table(&packet_mangler); -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(iptable_mangle); -+ KSYMUNRESOLVE(init_iptable_mangle); -+ KSYMUNRESOLVE(fini_iptable_mangle); -+ fini_iptable_mangle(); - } - - module_init(init); -diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_raw.c linux-2.6.16-026test009/net/ipv4/netfilter/iptable_raw.c ---- linux-2.6.16.orig/net/ipv4/netfilter/iptable_raw.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/netfilter/iptable_raw.c 2006-04-19 15:02:12.000000000 +0400 -@@ -118,12 +118,13 @@ static struct nf_hook_ops ipt_ops[] = { - - static int __init init(void) - { -+ struct ipt_table *tmp; - int ret; - - /* Register table */ -- ret = ipt_register_table(&packet_raw, &initial_table.repl); -- if (ret < 0) -- return ret; -+ tmp = ipt_register_table(&packet_raw, &initial_table.repl); -+ if (IS_ERR(tmp)) -+ return PTR_ERR(tmp); - - /* Register hooks */ - ret = nf_register_hook(&ipt_ops[0]); -diff -upr linux-2.6.16.orig/net/ipv4/proc.c linux-2.6.16-026test009/net/ipv4/proc.c ---- linux-2.6.16.orig/net/ipv4/proc.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/proc.c 2006-04-19 15:02:12.000000000 +0400 -@@ -258,11 +258,12 @@ static int snmp_seq_show(struct seq_file - seq_printf(seq, " %s", 
snmp4_ipstats_list[i].name); - - seq_printf(seq, "\nIp: %d %d", -- ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl); -+ ve_ipv4_devconf.forwarding ? 1 : 2, -+ sysctl_ip_default_ttl); - - for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) - seq_printf(seq, " %lu", -- fold_field((void **) ip_statistics, -+ fold_field((void **) ve_ip_statistics, - snmp4_ipstats_list[i].entry)); - - seq_puts(seq, "\nIcmp:"); -@@ -272,7 +273,7 @@ static int snmp_seq_show(struct seq_file - seq_puts(seq, "\nIcmp:"); - for (i = 0; snmp4_icmp_list[i].name != NULL; i++) - seq_printf(seq, " %lu", -- fold_field((void **) icmp_statistics, -+ fold_field((void **) ve_icmp_statistics, - snmp4_icmp_list[i].entry)); - - seq_puts(seq, "\nTcp:"); -@@ -284,11 +285,11 @@ static int snmp_seq_show(struct seq_file - /* MaxConn field is signed, RFC 2012 */ - if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) - seq_printf(seq, " %ld", -- fold_field((void **) tcp_statistics, -+ fold_field((void **) ve_tcp_statistics, - snmp4_tcp_list[i].entry)); - else - seq_printf(seq, " %lu", -- fold_field((void **) tcp_statistics, -+ fold_field((void **) ve_tcp_statistics, - snmp4_tcp_list[i].entry)); - } - -@@ -299,7 +300,7 @@ static int snmp_seq_show(struct seq_file - seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) - seq_printf(seq, " %lu", -- fold_field((void **) udp_statistics, -+ fold_field((void **) ve_udp_statistics, - snmp4_udp_list[i].entry)); - - seq_putc(seq, '\n'); -@@ -333,7 +334,7 @@ static int netstat_seq_show(struct seq_f - seq_puts(seq, "\nTcpExt:"); - for (i = 0; snmp4_net_list[i].name != NULL; i++) - seq_printf(seq, " %lu", -- fold_field((void **) net_statistics, -+ fold_field((void **) ve_net_statistics, - snmp4_net_list[i].entry)); - - seq_putc(seq, '\n'); -@@ -357,10 +358,10 @@ int __init ip_misc_proc_init(void) - { - int rc = 0; - -- if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops)) -+ if (!proc_glob_fops_create("net/netstat", S_IRUGO, 
&netstat_seq_fops)) - goto out_netstat; - -- if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops)) -+ if (!proc_glob_fops_create("net/snmp", S_IRUGO, &snmp_seq_fops)) - goto out_snmp; - - if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops)) -@@ -368,9 +369,9 @@ int __init ip_misc_proc_init(void) - out: - return rc; - out_sockstat: -- proc_net_remove("snmp"); -+ remove_proc_glob_entry("net/snmp", NULL); - out_snmp: -- proc_net_remove("netstat"); -+ remove_proc_glob_entry("net/netstat", NULL); - out_netstat: - rc = -ENOMEM; - goto out; -diff -upr linux-2.6.16.orig/net/ipv4/raw.c linux-2.6.16-026test009/net/ipv4/raw.c ---- linux-2.6.16.orig/net/ipv4/raw.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/raw.c 2006-04-19 15:02:12.000000000 +0400 -@@ -114,7 +114,8 @@ struct sock *__raw_v4_lookup(struct sock - if (inet->num == num && - !(inet->daddr && inet->daddr != raddr) && - !(inet->rcv_saddr && inet->rcv_saddr != laddr) && -- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) -+ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) && -+ ve_accessible_strict(VE_OWNER_SK(sk), get_exec_env())) - goto found; /* gotcha */ - } - sk = NULL; -@@ -753,8 +754,12 @@ static struct sock *raw_get_first(struct - struct hlist_node *node; - - sk_for_each(sk, node, &raw_v4_htable[state->bucket]) -- if (sk->sk_family == PF_INET) -+ if (sk->sk_family == PF_INET) { -+ if (!ve_accessible(VE_OWNER_SK(sk), -+ get_exec_env())) -+ continue; - goto found; -+ } - } - sk = NULL; - found: -@@ -768,8 +773,14 @@ static struct sock *raw_get_next(struct - do { - sk = sk_next(sk); - try_again: -- ; -- } while (sk && sk->sk_family != PF_INET); -+ if (!sk) -+ break; -+ if (sk->sk_family != PF_INET) -+ continue; -+ if (ve_accessible(VE_OWNER_SK(sk), -+ get_exec_env())) -+ break; -+ } while (1); - - if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { - sk = sk_head(&raw_v4_htable[state->bucket]); -@@ -886,13 +897,13 @@ static struct file_operations 
raw_seq_fo - - int __init raw_proc_init(void) - { -- if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops)) -+ if (!proc_glob_fops_create("net/raw", S_IRUGO, &raw_seq_fops)) - return -ENOMEM; - return 0; - } - - void __init raw_proc_exit(void) - { -- proc_net_remove("raw"); -+ remove_proc_glob_entry("net/raw", NULL); - } - #endif /* CONFIG_PROC_FS */ -diff -upr linux-2.6.16.orig/net/ipv4/route.c linux-2.6.16-026test009/net/ipv4/route.c ---- linux-2.6.16.orig/net/ipv4/route.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/route.c 2006-04-19 15:02:12.000000000 +0400 -@@ -114,6 +114,8 @@ - - #define RT_GC_TIMEOUT (300*HZ) - -+int ip_rt_src_check = 1; -+ - static int ip_rt_min_delay = 2 * HZ; - static int ip_rt_max_delay = 10 * HZ; - static int ip_rt_max_size; -@@ -253,11 +255,28 @@ static unsigned int rt_hash_code(u32 dad - & rt_hash_mask); - } - -+void prepare_rt_cache(void) -+{ -+#ifdef CONFIG_VE -+ struct rtable *r; -+ int i; -+ -+ for (i = rt_hash_mask; i >= 0; i--) { -+ spin_lock_bh(rt_hash_lock_addr(i)); -+ for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) { -+ r->fl.owner_env = get_ve0(); -+ } -+ spin_unlock_bh(rt_hash_lock_addr(i)); -+ } -+#endif -+} -+ - #ifdef CONFIG_PROC_FS - struct rt_cache_iter_state { - int bucket; - }; - -+static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r); - static struct rtable *rt_cache_get_first(struct seq_file *seq) - { - struct rtable *r = NULL; -@@ -270,6 +289,8 @@ static struct rtable *rt_cache_get_first - break; - rcu_read_unlock_bh(); - } -+ if (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())) -+ r = rt_cache_get_next(seq, r); - return r; - } - -@@ -277,14 +298,19 @@ static struct rtable *rt_cache_get_next( - { - struct rt_cache_iter_state *st = rcu_dereference(seq->private); - -- r = r->u.rt_next; -+start: -+ do { -+ r = r->u.rt_next; -+ } while (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())); - while (!r) { - rcu_read_unlock_bh(); - if 
(--st->bucket < 0) -- break; -+ goto out; - rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; - } -+ goto start; -+out: - return r; - } - -@@ -556,7 +582,8 @@ static inline int compare_keys(struct fl - { - return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && - fl1->oif == fl2->oif && -- fl1->iif == fl2->iif; -+ fl1->iif == fl2->iif && -+ ve_accessible_strict(fl1->owner_env, fl2->owner_env); - } - - #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED -@@ -670,26 +697,105 @@ static void rt_check_expire(unsigned lon - mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); - } - -+typedef unsigned long rt_flush_gen_t; -+ -+#ifdef CONFIG_VE -+ -+static rt_flush_gen_t rt_flush_gen; -+ -+/* called under rt_flush_lock */ -+static void set_rt_flush_required(struct ve_struct *env) -+{ -+ /* -+ * If the global generation rt_flush_gen is equal to G, then -+ * the pass considering entries labelled by G is yet to come. -+ */ -+ env->rt_flush_required = rt_flush_gen; -+} -+ -+static spinlock_t rt_flush_lock; -+static rt_flush_gen_t reset_rt_flush_required(void) -+{ -+ rt_flush_gen_t g; -+ -+ spin_lock_bh(&rt_flush_lock); -+ g = rt_flush_gen++; -+ spin_unlock_bh(&rt_flush_lock); -+ return g; -+} -+ -+static int check_rt_flush_required(struct ve_struct *env, rt_flush_gen_t gen) -+{ -+ /* can be checked without the lock */ -+ return env->rt_flush_required >= gen; -+} -+ -+#else -+ -+static void set_rt_flush_required(struct ve_struct *env) -+{ -+} -+ -+static rt_flush_gen_t reset_rt_flush_required(void) -+{ -+ return 0; -+} -+ -+#endif -+ - /* This can run from both BH and non-BH contexts, the latter - * in the case of a forced flush event. 
- */ - static void rt_run_flush(unsigned long dummy) - { - int i; -- struct rtable *rth, *next; -+ struct rtable * rth, * next; -+ struct rtable * tail; -+ rt_flush_gen_t gen; - - rt_deadline = 0; - - get_random_bytes(&rt_hash_rnd, 4); - -+ gen = reset_rt_flush_required(); -+ - for (i = rt_hash_mask; i >= 0; i--) { -+#ifdef CONFIG_VE -+ struct rtable ** prev, * p; -+ -+ spin_lock_bh(rt_hash_lock_addr(i)); -+ rth = rt_hash_table[i].chain; -+ -+ /* defer releasing the head of the list after spin_unlock */ -+ for (tail = rth; tail; tail = tail->u.rt_next) -+ if (!check_rt_flush_required(tail->fl.owner_env, gen)) -+ break; -+ if (rth != tail) -+ rt_hash_table[i].chain = tail; -+ -+ /* call rt_free on entries after the tail requiring flush */ -+ prev = &rt_hash_table[i].chain; -+ for (p = *prev; p; p = next) { -+ next = p->u.rt_next; -+ if (!check_rt_flush_required(p->fl.owner_env, gen)) { -+ prev = &p->u.rt_next; -+ } else { -+ *prev = next; -+ rt_free(p); -+ } -+ } -+ -+#else - spin_lock_bh(rt_hash_lock_addr(i)); - rth = rt_hash_table[i].chain; - if (rth) - rt_hash_table[i].chain = NULL; -+ tail = NULL; -+ -+#endif - spin_unlock_bh(rt_hash_lock_addr(i)); - -- for (; rth; rth = next) { -+ for (; rth != tail; rth = next) { - next = rth->u.rt_next; - rt_free(rth); - } -@@ -728,6 +834,8 @@ void rt_cache_flush(int delay) - delay = tmo; - } - -+ set_rt_flush_required(get_exec_env()); -+ - if (delay <= 0) { - spin_unlock_bh(&rt_flush_lock); - rt_run_flush(0); -@@ -743,9 +851,30 @@ void rt_cache_flush(int delay) - - static void rt_secret_rebuild(unsigned long dummy) - { -+ int i; -+ struct rtable *rth, *next; - unsigned long now = jiffies; - -- rt_cache_flush(0); -+ spin_lock_bh(&rt_flush_lock); -+ del_timer(&rt_flush_timer); -+ spin_unlock_bh(&rt_flush_lock); -+ -+ rt_deadline = 0; -+ get_random_bytes(&rt_hash_rnd, 4); -+ -+ for (i = rt_hash_mask; i >= 0; i--) { -+ spin_lock_bh(rt_hash_lock_addr(i)); -+ rth = rt_hash_table[i].chain; -+ if (rth) -+ rt_hash_table[i].chain = 
NULL; -+ spin_unlock_bh(rt_hash_lock_addr(i)); -+ -+ for (; rth; rth = next) { -+ next = rth->u.rt_next; -+ rt_free(rth); -+ } -+ } -+ - mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); - } - -@@ -1118,7 +1247,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd - struct rtable *rth, **rthp; - u32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; -+ struct ve_struct *ve; - -+ ve = get_exec_env(); - tos &= IPTOS_RT_MASK; - - if (!in_dev) -@@ -1154,6 +1285,10 @@ void ip_rt_redirect(u32 old_gw, u32 dadd - rth->fl.fl4_src != skeys[i] || - rth->fl.fl4_tos != tos || - rth->fl.oif != ikeys[k] || -+#ifdef CONFIG_VE -+ !ve_accessible_strict(rth->fl.owner_env, -+ ve) || -+#endif - rth->fl.iif != 0) { - rthp = &rth->u.rt_next; - continue; -@@ -1192,6 +1327,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd - rt->u.dst.neighbour = NULL; - rt->u.dst.hh = NULL; - rt->u.dst.xfrm = NULL; -+#ifdef CONFIG_VE -+ rt->fl.owner_env = ve; -+#endif - - rt->rt_flags |= RTCF_REDIRECTED; - -@@ -1631,6 +1769,9 @@ static int ip_route_input_mc(struct sk_b - #ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark= skb->nfmark; - #endif -+#ifdef CONFIG_VE -+ rth->fl.owner_env = get_exec_env(); -+#endif - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; - #ifdef CONFIG_NET_CLS_ROUTE -@@ -1776,6 +1917,9 @@ static inline int __mkroute_input(struct - #ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark= skb->nfmark; - #endif -+#ifdef CONFIG_VE -+ rth->fl.owner_env = get_exec_env(); -+#endif - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; - rth->rt_gateway = daddr; -@@ -2021,6 +2165,9 @@ local_input: - #ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark= skb->nfmark; - #endif -+#ifdef CONFIG_VE -+ rth->fl.owner_env = get_exec_env(); -+#endif - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; - #ifdef CONFIG_NET_CLS_ROUTE -@@ -2100,6 +2247,9 @@ int ip_route_input(struct sk_buff *skb, - #ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark == skb->nfmark && - #endif -+#ifdef CONFIG_VE -+ 
rth->fl.owner_env == get_exec_env() && -+#endif - rth->fl.fl4_tos == tos) { - rth->u.dst.lastuse = jiffies; - dst_hold(&rth->u.dst); -@@ -2226,6 +2376,9 @@ static inline int __mkroute_output(struc - #ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark= oldflp->fl4_fwmark; - #endif -+#ifdef CONFIG_VE -+ rth->fl.owner_env = get_exec_env(); -+#endif - rth->rt_dst = fl->fl4_dst; - rth->rt_src = fl->fl4_src; - rth->rt_iif = oldflp->oif ? : dev_out->ifindex; -@@ -2399,10 +2552,13 @@ static int ip_route_output_slow(struct r - ZERONET(oldflp->fl4_src)) - goto out; - -- /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ -- dev_out = ip_dev_find(oldflp->fl4_src); -- if (dev_out == NULL) -- goto out; -+ if (ip_rt_src_check) { -+ /* It is equivalent to -+ inet_addr_type(saddr) == RTN_LOCAL */ -+ dev_out = ip_dev_find(oldflp->fl4_src); -+ if (dev_out == NULL) -+ goto out; -+ } - - /* I removed check for oif == dev_out->oif here. - It was wrong for two reasons: -@@ -2429,6 +2585,12 @@ static int ip_route_output_slow(struct r - Luckily, this hack is good workaround. - */ - -+ if (dev_out == NULL) { -+ dev_out = ip_dev_find(oldflp->fl4_src); -+ if (dev_out == NULL) -+ goto out; -+ } -+ - fl.oif = dev_out->ifindex; - goto make_route; - } -@@ -2575,6 +2737,7 @@ int __ip_route_output_key(struct rtable - #ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark == flp->fl4_fwmark && - #endif -+ ve_accessible_strict(rth->fl.owner_env, get_exec_env()) && - !((rth->fl.fl4_tos ^ flp->fl4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK))) { - -@@ -2705,7 +2868,7 @@ static int rt_fill_info(struct sk_buff * - u32 dst = rt->rt_dst; - - if (MULTICAST(dst) && !LOCAL_MCAST(dst) && -- ipv4_devconf.mc_forwarding) { -+ ve_ipv4_devconf.mc_forwarding) { - int err = ipmr_get_route(skb, r, nowait); - if (err <= 0) { - if (!nowait) { -@@ -2750,7 +2913,10 @@ int inet_rtm_getroute(struct sk_buff *in - /* Reserve room for dummy headers, this skb can pass - through good chunk of routing engine. 
- */ -- skb->mac.raw = skb->data; -+ skb->mac.raw = skb->nh.raw = skb->data; -+ -+ /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ -+ skb->nh.iph->protocol = IPPROTO_ICMP; - skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); - - if (rta[RTA_SRC - 1]) -@@ -2853,22 +3019,22 @@ void ip_rt_multicast_event(struct in_dev - } - - #ifdef CONFIG_SYSCTL --static int flush_delay; -+int ipv4_flush_delay; - --static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, -+int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, - size_t *lenp, loff_t *ppos) - { - if (write) { - proc_dointvec(ctl, write, filp, buffer, lenp, ppos); -- rt_cache_flush(flush_delay); -+ rt_cache_flush(ipv4_flush_delay); - return 0; - } - - return -EINVAL; - } - --static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, -+int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, - int __user *name, - int nlen, - void __user *oldval, -@@ -2890,7 +3056,7 @@ ctl_table ipv4_route_table[] = { - { - .ctl_name = NET_IPV4_ROUTE_FLUSH, - .procname = "flush", -- .data = &flush_delay, -+ .data = &ipv4_flush_delay, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = &ipv4_sysctl_rtcache_flush, -@@ -3184,15 +3350,18 @@ int __init ip_rt_init(void) - #ifdef CONFIG_PROC_FS - { - struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ -- if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || -- !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, -- proc_net_stat))) { -+ -+ if (!proc_glob_fops_create("net/rt_cache", -+ S_IRUGO, &rt_cache_seq_fops)) -+ return -ENOMEM; -+ -+ if (!(rtstat_pde = create_proc_glob_entry("net/stat/rt_cache", -+ S_IRUGO, NULL))) - return -ENOMEM; -- } - rtstat_pde->proc_fops = &rt_cpu_seq_fops; - } - #ifdef CONFIG_NET_CLS_ROUTE -- create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); -+ create_proc_read_entry("net/rt_acct", 0, NULL, ip_rt_acct_read, NULL); - #endif - 
#endif - #ifdef CONFIG_XFRM -diff -upr linux-2.6.16.orig/net/ipv4/sysctl_net_ipv4.c linux-2.6.16-026test009/net/ipv4/sysctl_net_ipv4.c ---- linux-2.6.16.orig/net/ipv4/sysctl_net_ipv4.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/sysctl_net_ipv4.c 2006-04-19 15:02:12.000000000 +0400 -@@ -33,22 +33,21 @@ struct ipv4_config ipv4_config; - - #ifdef CONFIG_SYSCTL - --static - int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) - { -- int val = ipv4_devconf.forwarding; -+ int val = ve_ipv4_devconf.forwarding; - int ret; - - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - -- if (write && ipv4_devconf.forwarding != val) -+ if (write && ve_ipv4_devconf.forwarding != val) - inet_forward_change(); - - return ret; - } - --static int ipv4_sysctl_forward_strategy(ctl_table *table, -+int ipv4_sysctl_forward_strategy(ctl_table *table, - int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, -diff -upr linux-2.6.16.orig/net/ipv4/tcp.c linux-2.6.16-026test009/net/ipv4/tcp.c ---- linux-2.6.16.orig/net/ipv4/tcp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/tcp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -248,6 +248,7 @@ - */ - - #include <linux/config.h> -+#include <linux/kmem_cache.h> - #include <linux/module.h> - #include <linux/types.h> - #include <linux/fcntl.h> -@@ -263,6 +264,9 @@ - #include <net/xfrm.h> - #include <net/ip.h> - -+#include <ub/ub_orphan.h> -+#include <ub/ub_net.h> -+#include <ub/ub_tcp.h> - - #include <asm/uaccess.h> - #include <asm/ioctls.h> -@@ -321,6 +325,7 @@ unsigned int tcp_poll(struct file *file, - unsigned int mask; - struct sock *sk = sock->sk; - struct tcp_sock *tp = tcp_sk(sk); -+ int check_send_space; - - poll_wait(file, sk->sk_sleep, wait); - if (sk->sk_state == TCP_LISTEN) -@@ -335,6 +340,21 @@ unsigned int tcp_poll(struct file *file, - if (sk->sk_err) - 
mask = POLLERR; - -+ check_send_space = 1; -+#ifdef CONFIG_USER_RESOURCE -+ if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) { -+ unsigned long size; -+ size = MAX_TCP_HEADER + tp->mss_cache; -+ if (size > SOCK_MIN_UBCSPACE) -+ size = SOCK_MIN_UBCSPACE; -+ size = skb_charge_size(size); -+ if (ub_sock_makewres_tcp(sk, size)) { -+ check_send_space = 0; -+ ub_sock_sndqueueadd_tcp(sk, size); -+ } -+ } -+#endif -+ - /* - * POLLHUP is certainly not done right. But poll() doesn't - * have a notion of HUP in just one direction, and for a -@@ -378,7 +398,7 @@ unsigned int tcp_poll(struct file *file, - sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) - mask |= POLLIN | POLLRDNORM; - -- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { -+ if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { - mask |= POLLOUT | POLLWRNORM; - } else { /* send SIGIO later */ -@@ -528,16 +548,23 @@ static ssize_t do_tcp_sendpages(struct s - int copy, i, can_coalesce; - int offset = poffset % PAGE_SIZE; - int size = min_t(size_t, psize, PAGE_SIZE - offset); -+ unsigned long chargesize = 0; - - if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { - new_segment: -+ chargesize = 0; - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - -+ chargesize = skb_charge_size(MAX_TCP_HEADER + -+ tp->mss_cache); -+ if (ub_sock_getwres_tcp(sk, chargesize) < 0) -+ goto wait_for_ubspace; - skb = sk_stream_alloc_pskb(sk, 0, 0, - sk->sk_allocation); - if (!skb) - goto wait_for_memory; -+ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); - - skb_entail(sk, tp, skb); - copy = size_goal; -@@ -593,10 +620,14 @@ new_segment: - wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - wait_for_memory: -+ ub_sock_retwres_tcp(sk, chargesize, -+ skb_charge_size(MAX_TCP_HEADER + tp->mss_cache)); -+ chargesize = 0; -+wait_for_ubspace: - if (copied) - tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - -- if ((err = 
sk_stream_wait_memory(sk, &timeo)) != 0) -+ if ((err = sk_stream_wait_memory(sk, &timeo, chargesize)) != 0) - goto do_error; - - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); -@@ -699,6 +730,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru - while (--iovlen >= 0) { - int seglen = iov->iov_len; - unsigned char __user *from = iov->iov_base; -+ unsigned long chargesize = 0; - - iov++; - -@@ -709,18 +741,26 @@ int tcp_sendmsg(struct kiocb *iocb, stru - - if (!sk->sk_send_head || - (copy = size_goal - skb->len) <= 0) { -+ unsigned long size; - - new_segment: - /* Allocate new segment. If the interface is SG, - * allocate skb fitting to single page. - */ -+ chargesize = 0; - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; -- -- skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), -- 0, sk->sk_allocation); -+ size = select_size(sk, tp); -+ chargesize = skb_charge_size(MAX_TCP_HEADER + -+ size); -+ if (ub_sock_getwres_tcp(sk, chargesize) < 0) -+ goto wait_for_ubspace; -+ skb = sk_stream_alloc_pskb(sk, size, 0, -+ sk->sk_allocation); - if (!skb) - goto wait_for_memory; -+ ub_skb_set_charge(skb, sk, chargesize, -+ UB_TCPSNDBUF); - - /* - * Check whether we can use HW checksum. -@@ -768,6 +808,7 @@ new_segment: - } else if (page) { - if (off == PAGE_SIZE) { - put_page(page); -+ ub_sock_tcp_detachpage(sk); - TCP_PAGE(sk) = page = NULL; - off = 0; - } -@@ -781,6 +822,9 @@ new_segment: - goto wait_for_memory; - - if (!page) { -+ chargesize = PAGE_SIZE; -+ if (ub_sock_tcp_chargepage(sk) < 0) -+ goto wait_for_ubspace; - /* Allocate new cache page. 
*/ - if (!(page = sk_stream_alloc_page(sk))) - goto wait_for_memory; -@@ -812,7 +856,8 @@ new_segment: - } else if (off + copy < PAGE_SIZE) { - get_page(page); - TCP_PAGE(sk) = page; -- } -+ } else -+ ub_sock_tcp_detachpage(sk); - } - - TCP_OFF(sk) = off + copy; -@@ -843,10 +888,15 @@ new_segment: - wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - wait_for_memory: -+ ub_sock_retwres_tcp(sk, chargesize, -+ skb_charge_size(MAX_TCP_HEADER+tp->mss_cache)); -+ chargesize = 0; -+wait_for_ubspace: - if (copied) - tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - -- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) -+ if ((err = sk_stream_wait_memory(sk, &timeo, -+ chargesize)) != 0) - goto do_error; - - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); -@@ -944,7 +994,18 @@ static void cleanup_rbuf(struct sock *sk - #if TCP_DEBUG - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - -- BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); -+ if (!(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) { -+ printk("KERNEL: assertion: skb==NULL || " -+ "before(tp->copied_seq, skb->end_seq)\n"); -+ printk("VE%u pid %d comm %.16s\n", -+ (get_exec_env() ? VEID(get_exec_env()) : 0), -+ current->pid, current->comm); -+ printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied, -+ tp->copied_seq, tp->rcv_nxt); -+ printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n", -+ skb->len, TCP_SKB_CB(skb)->seq, -+ TCP_SKB_CB(skb)->end_seq); -+ } - #endif - - if (inet_csk_ack_scheduled(sk)) { -@@ -1168,7 +1229,22 @@ int tcp_recvmsg(struct kiocb *iocb, stru - goto found_ok_skb; - if (skb->h.th->fin) - goto found_fin_ok; -- BUG_TRAP(flags & MSG_PEEK); -+ if (!(flags & MSG_PEEK)) { -+ printk("KERNEL: assertion: flags&MSG_PEEK\n"); -+ printk("VE%u pid %d comm %.16s\n", -+ (get_exec_env() ? 
-+ VEID(get_exec_env()) : 0), -+ current->pid, current->comm); -+ printk("flags=0x%x, len=%d, copied_seq=%d, " -+ "rcv_nxt=%d\n", flags, len, -+ tp->copied_seq, tp->rcv_nxt); -+ printk("skb->len=%d, *seq=%d, skb->seq=%d, " -+ "skb->end_seq=%d, offset=%d\n", -+ skb->len, *seq, -+ TCP_SKB_CB(skb)->seq, -+ TCP_SKB_CB(skb)->end_seq, -+ offset); -+ } - skb = skb->next; - } while (skb != (struct sk_buff *)&sk->sk_receive_queue); - -@@ -1231,8 +1307,18 @@ int tcp_recvmsg(struct kiocb *iocb, stru - - tp->ucopy.len = len; - -- BUG_TRAP(tp->copied_seq == tp->rcv_nxt || -- (flags & (MSG_PEEK | MSG_TRUNC))); -+ if (!(tp->copied_seq == tp->rcv_nxt || -+ (flags&(MSG_PEEK|MSG_TRUNC)))) { -+ printk("KERNEL: assertion: tp->copied_seq == " -+ "tp->rcv_nxt || ...\n"); -+ printk("VE%u pid %d comm %.16s\n", -+ (get_exec_env() ? -+ VEID(get_exec_env()) : 0), -+ current->pid, current->comm); -+ printk("flags=0x%x, len=%d, copied_seq=%d, " -+ "rcv_nxt=%d\n", flags, len, -+ tp->copied_seq, tp->rcv_nxt); -+ } - - /* Ugly... 
If prequeue is not empty, we have to - * process it before releasing socket, otherwise -@@ -1583,7 +1669,7 @@ adjudge_to_death: - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); - } else { -- atomic_inc(sk->sk_prot->orphan_count); -+ ub_inc_orphan_count(sk); - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); - goto out; - } -@@ -1591,9 +1677,7 @@ adjudge_to_death: - } - if (sk->sk_state != TCP_CLOSE) { - sk_stream_mem_reclaim(sk); -- if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || -- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && -- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { -+ if (ub_too_many_orphans(sk, ub_get_orphan_count(sk))) { - if (net_ratelimit()) - printk(KERN_INFO "TCP: too many of orphaned " - "sockets\n"); -@@ -1602,7 +1686,7 @@ adjudge_to_death: - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); - } - } -- atomic_inc(sk->sk_prot->orphan_count); -+ ub_inc_orphan_count(sk); - - if (sk->sk_state == TCP_CLOSE) - inet_csk_destroy_sock(sk); -@@ -2051,7 +2135,7 @@ void __init tcp_init(void) - tcp_hashinfo.bind_bucket_cachep = - kmem_cache_create("tcp_bind_bucket", - sizeof(struct inet_bind_bucket), 0, -- SLAB_HWCACHE_ALIGN, NULL, NULL); -+ SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL, NULL); - if (!tcp_hashinfo.bind_bucket_cachep) - panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); - -diff -upr linux-2.6.16.orig/net/ipv4/tcp_input.c linux-2.6.16-026test009/net/ipv4/tcp_input.c ---- linux-2.6.16.orig/net/ipv4/tcp_input.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/tcp_input.c 2006-04-19 15:02:12.000000000 +0400 -@@ -72,6 +72,8 @@ - #include <linux/ipsec.h> - #include <asm/unaligned.h> - -+#include <ub/ub_tcp.h> -+ - int sysctl_tcp_timestamps = 1; - int sysctl_tcp_window_scaling = 1; - int sysctl_tcp_sack = 1; -@@ -252,7 +254,7 @@ static void tcp_grow_window(struct sock - /* Check #1 */ - if (tp->rcv_ssthresh < tp->window_clamp && - (int)tp->rcv_ssthresh < tcp_space(sk) && -- 
!tcp_memory_pressure) { -+ ub_tcp_rmem_allows_expand(sk)) { - int incr; - - /* Check #2. Increase window, if skb with such overhead -@@ -321,6 +323,8 @@ static void tcp_init_buffer_space(struct - - tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; -+ -+ ub_tcp_update_maxadvmss(sk); - } - - /* 5. Recalculate window clamp after socket hit its memory bounds. */ -@@ -332,7 +336,7 @@ static void tcp_clamp_window(struct sock - - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && -- !tcp_memory_pressure && -+ !ub_tcp_memory_pressure(sk) && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); -@@ -3118,7 +3122,7 @@ queue_and_out: - !sk_stream_rmem_schedule(sk, skb))) { - if (tcp_prune_queue(sk) < 0 || - !sk_stream_rmem_schedule(sk, skb)) -- goto drop; -+ goto drop_part; - } - sk_stream_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); -@@ -3162,6 +3166,12 @@ out_of_window: - drop: - __kfree_skb(skb); - return; -+ -+drop_part: -+ if (after(tp->copied_seq, tp->rcv_nxt)) -+ tp->rcv_nxt = tp->copied_seq; -+ __kfree_skb(skb); -+ return; - } - - /* Out of window. F.e. zero window probe. 
*/ -@@ -3333,6 +3343,10 @@ tcp_collapse(struct sock *sk, struct sk_ - nskb = alloc_skb(copy+header, GFP_ATOMIC); - if (!nskb) - return; -+ if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) { -+ kfree_skb(nskb); -+ return; -+ } - skb_reserve(nskb, header); - memcpy(nskb->head, skb->head, header); - nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); -@@ -3429,7 +3443,7 @@ static int tcp_prune_queue(struct sock * - - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - tcp_clamp_window(sk, tp); -- else if (tcp_memory_pressure) -+ else if (ub_tcp_memory_pressure(sk)) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); - - tcp_collapse_ofo_queue(sk); -@@ -3505,7 +3519,7 @@ static int tcp_should_expand_sndbuf(stru - return 0; - - /* If we are under global TCP memory pressure, do not expand. */ -- if (tcp_memory_pressure) -+ if (ub_tcp_memory_pressure(sk)) - return 0; - - /* If we are under soft global TCP memory pressure, do not expand. */ -@@ -3898,6 +3912,10 @@ int tcp_rcv_established(struct sock *sk, - - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; -+ /* This is OK not to try to free memory here. -+ * Do this below on slow path. 
Den */ -+ if (ub_tcprcvbuf_charge(sk, skb) < 0) -+ goto step5; - - NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); - -diff -upr linux-2.6.16.orig/net/ipv4/tcp_ipv4.c linux-2.6.16-026test009/net/ipv4/tcp_ipv4.c ---- linux-2.6.16.orig/net/ipv4/tcp_ipv4.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/tcp_ipv4.c 2006-04-19 15:02:13.000000000 +0400 -@@ -72,6 +72,8 @@ - #include <net/timewait_sock.h> - #include <net/xfrm.h> - -+#include <ub/ub_tcp.h> -+ - #include <linux/inet.h> - #include <linux/ipv6.h> - #include <linux/stddef.h> -@@ -705,6 +707,7 @@ struct request_sock_ops tcp_request_sock - .destructor = tcp_v4_reqsk_destructor, - .send_reset = tcp_v4_send_reset, - }; -+EXPORT_SYMBOL_GPL(tcp_request_sock_ops); - - static struct timewait_sock_ops tcp_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct tcp_timewait_sock), -@@ -979,12 +982,15 @@ static int tcp_v4_checksum_init(struct s - */ - int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) - { -+ struct user_beancounter *ub; -+ -+ ub = set_exec_ub(sock_bc(sk)->ub); - if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - TCP_CHECK_TIMER(sk); - if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) - goto reset; - TCP_CHECK_TIMER(sk); -- return 0; -+ goto restore_context; - } - - if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) -@@ -998,7 +1004,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc - if (nsk != sk) { - if (tcp_child_process(sk, nsk, skb)) - goto reset; -- return 0; -+ goto restore_context; - } - } - -@@ -1006,6 +1012,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc - if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) - goto reset; - TCP_CHECK_TIMER(sk); -+ -+restore_context: -+ (void)set_exec_ub(ub); - return 0; - - reset: -@@ -1017,7 +1026,7 @@ discard: - * might be destroyed here. This current version compiles correctly, - * but you have been warned. 
- */ -- return 0; -+ goto restore_context; - - csum_err: - TCP_INC_STATS_BH(TCP_MIB_INERRS); -@@ -1302,6 +1311,8 @@ int tcp_v4_destroy_sock(struct sock *sk) - * If sendmsg cached page exists, toss it. - */ - if (sk->sk_sndmsg_page) { -+ /* queue is empty, uncharge */ -+ ub_sock_tcp_detachpage(sk); - __free_page(sk->sk_sndmsg_page); - sk->sk_sndmsg_page = NULL; - } -@@ -1316,16 +1327,34 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); - #ifdef CONFIG_PROC_FS - /* Proc filesystem TCP sock list dumping. */ - --static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) -+static inline struct inet_timewait_sock *tw_head(struct hlist_head *head, -+ envid_t veid) - { -- return hlist_empty(head) ? NULL : -- list_entry(head->first, struct inet_timewait_sock, tw_node); -+ struct inet_timewait_sock *tw; -+ struct hlist_node *pos; -+ -+ if (hlist_empty(head)) -+ return NULL; -+ hlist_for_each_entry(tw, pos, head, tw_node) { -+ if (!ve_accessible_veid(tw->tw_owner_env, veid)) -+ continue; -+ return tw; -+ } -+ return NULL; - } - --static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) -+static inline struct inet_timewait_sock * -+ tw_next(struct inet_timewait_sock *tw, envid_t veid) - { -- return tw->tw_node.next ? 
-- hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; -+ while (1) { -+ if (tw->tw_node.next == NULL) -+ return NULL; -+ tw = hlist_entry(tw->tw_node.next, typeof(*tw), tw_node); -+ if (!ve_accessible_veid(tw->tw_owner_env, veid)) -+ continue; -+ return tw; -+ } -+ return NULL; /* make compiler happy */ - } - - static void *listening_get_next(struct seq_file *seq, void *cur) -@@ -1334,7 +1363,9 @@ static void *listening_get_next(struct s - struct hlist_node *node; - struct sock *sk = cur; - struct tcp_iter_state* st = seq->private; -+ struct ve_struct *ve; - -+ ve = get_exec_env(); - if (!sk) { - st->bucket = 0; - sk = sk_head(&tcp_hashinfo.listening_hash[0]); -@@ -1374,6 +1405,8 @@ get_req: - } - get_sk: - sk_for_each_from(sk, node) { -+ if (!ve_accessible(VE_OWNER_SK(sk), ve)) -+ continue; - if (sk->sk_family == st->family) { - cur = sk; - goto out; -@@ -1414,7 +1447,9 @@ static void *established_get_first(struc - { - struct tcp_iter_state* st = seq->private; - void *rc = NULL; -+ struct ve_struct *ve; - -+ ve = get_exec_env(); - for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { - struct sock *sk; - struct hlist_node *node; -@@ -1425,6 +1460,8 @@ static void *established_get_first(struc - - read_lock(&tcp_hashinfo.ehash[st->bucket].lock); - sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { -+ if (!ve_accessible(VE_OWNER_SK(sk), ve)) -+ continue; - if (sk->sk_family != st->family) { - continue; - } -@@ -1434,6 +1471,8 @@ static void *established_get_first(struc - st->state = TCP_SEQ_STATE_TIME_WAIT; - inet_twsk_for_each(tw, node, - &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { -+ if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) -+ continue; - if (tw->tw_family != st->family) { - continue; - } -@@ -1453,16 +1492,17 @@ static void *established_get_next(struct - struct inet_timewait_sock *tw; - struct hlist_node *node; - struct tcp_iter_state* st = seq->private; -+ struct ve_struct *ve; - -+ ve = 
get_exec_env(); - ++st->num; - - if (st->state == TCP_SEQ_STATE_TIME_WAIT) { - tw = cur; -- tw = tw_next(tw); -+ tw = tw_next(tw, VEID(ve)); - get_tw: -- while (tw && tw->tw_family != st->family) { -- tw = tw_next(tw); -- } -+ while (tw && tw->tw_family != st->family) -+ tw = tw_next(tw, VEID(ve)); - if (tw) { - cur = tw; - goto out; -@@ -1484,12 +1524,15 @@ get_tw: - sk = sk_next(sk); - - sk_for_each_from(sk, node) { -+ if (!ve_accessible(VE_OWNER_SK(sk), ve)) -+ continue; - if (sk->sk_family == st->family) - goto found; - } - - st->state = TCP_SEQ_STATE_TIME_WAIT; -- tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); -+ tw = tw_head(&tcp_hashinfo.ehash[st->bucket + -+ tcp_hashinfo.ehash_size].chain, VEID(ve)); - goto get_tw; - found: - cur = sk; -@@ -1635,7 +1678,12 @@ int tcp_proc_register(struct tcp_seq_afi - afinfo->seq_fops->llseek = seq_lseek; - afinfo->seq_fops->release = seq_release_private; - -- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); -+ if (*(afinfo->name) == 'n') -+ p = proc_glob_fops_create(afinfo->name, S_IRUGO, -+ afinfo->seq_fops); -+ else -+ p = proc_net_fops_create(afinfo->name, S_IRUGO, -+ afinfo->seq_fops); - if (p) - p->data = afinfo; - else -@@ -1647,7 +1695,10 @@ void tcp_proc_unregister(struct tcp_seq_ - { - if (!afinfo) - return; -- proc_net_remove(afinfo->name); -+ if (*(afinfo->name) == 'n') -+ remove_proc_glob_entry(afinfo->name, NULL); -+ else -+ proc_net_remove(afinfo->name); - memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); - } - -@@ -1777,7 +1828,7 @@ out: - static struct file_operations tcp4_seq_fops; - static struct tcp_seq_afinfo tcp4_seq_afinfo = { - .owner = THIS_MODULE, -- .name = "tcp", -+ .name = "net/tcp", - .family = AF_INET, - .seq_show = tcp4_seq_show, - .seq_fops = &tcp4_seq_fops, -@@ -1844,6 +1895,86 @@ void __init tcp_v4_init(struct net_proto - tcp_socket->sk->sk_prot->unhash(tcp_socket->sk); - } - -+#if defined(CONFIG_VE_NETDEV) || 
defined(CONFIG_VE_NETDEV_MODULE) -+static void tcp_kill_ve_onesk(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ /* Check the assumed state of the socket. */ -+ if (!sock_flag(sk, SOCK_DEAD)) { -+ static int printed; -+invalid: -+ if (!printed) -+ printk(KERN_DEBUG "Killing sk: dead %d, state %d, " -+ "wrseq %u unseq %u, wrqu %d.\n", -+ sock_flag(sk, SOCK_DEAD), sk->sk_state, -+ tp->write_seq, tp->snd_una, -+ !skb_queue_empty(&sk->sk_write_queue)); -+ printed = 1; -+ return; -+ } -+ -+ tcp_send_active_reset(sk, GFP_ATOMIC); -+ switch (sk->sk_state) { -+ case TCP_FIN_WAIT1: -+ case TCP_CLOSING: -+ /* In these 2 states the peer may want us to retransmit -+ * some data and/or FIN. Entering "resetting mode" -+ * instead. -+ */ -+ tcp_time_wait(sk, TCP_CLOSE, 0); -+ break; -+ case TCP_FIN_WAIT2: -+ /* By some reason the socket may stay in this state -+ * without turning into a TW bucket. Fix it. -+ */ -+ tcp_time_wait(sk, TCP_FIN_WAIT2, 0); -+ break; -+ case TCP_LAST_ACK: -+ /* Just jump into CLOSED state. */ -+ tcp_done(sk); -+ break; -+ default: -+ /* The socket must be already close()d. 
*/ -+ goto invalid; -+ } -+} -+ -+void tcp_v4_kill_ve_sockets(struct ve_struct *envid) -+{ -+ struct inet_ehash_bucket *head; -+ int i; -+ -+ /* alive */ -+ local_bh_disable(); -+ head = tcp_hashinfo.ehash; -+ for (i = 0; i < tcp_hashinfo.ehash_size; i++) { -+ struct sock *sk; -+ struct hlist_node *node; -+more_work: -+ write_lock(&head[i].lock); -+ sk_for_each(sk, node, &head[i].chain) { -+ if (ve_accessible_strict(VE_OWNER_SK(sk), envid)) { -+ sock_hold(sk); -+ write_unlock(&head[i].lock); -+ -+ bh_lock_sock(sk); -+ /* sk might have disappeared from the hash before -+ * we got the lock */ -+ if (sk->sk_state != TCP_CLOSE) -+ tcp_kill_ve_onesk(sk); -+ bh_unlock_sock(sk); -+ sock_put(sk); -+ goto more_work; -+ } -+ } -+ write_unlock(&head[i].lock); -+ } -+ local_bh_enable(); -+} -+EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); -+#endif -+ - EXPORT_SYMBOL(ipv4_specific); - EXPORT_SYMBOL(tcp_hashinfo); - EXPORT_SYMBOL(tcp_prot); -diff -upr linux-2.6.16.orig/net/ipv4/tcp_minisocks.c linux-2.6.16-026test009/net/ipv4/tcp_minisocks.c ---- linux-2.6.16.orig/net/ipv4/tcp_minisocks.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/tcp_minisocks.c 2006-04-19 15:02:12.000000000 +0400 -@@ -29,6 +29,8 @@ - #include <net/inet_common.h> - #include <net/xfrm.h> - -+#include <ub/ub_net.h> -+ - #ifdef CONFIG_SYSCTL - #define SYNC_INIT 0 /* let the user enable it */ - #else -@@ -307,6 +309,8 @@ void tcp_time_wait(struct sock *sk, int - tw->tw_ipv6only = np->ipv6only; - } - #endif -+ tw->tw_owner_env = VEID(VE_OWNER_SK(sk)); -+ - /* Linkage updates. 
*/ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - -@@ -355,6 +359,8 @@ struct sock *tcp_create_openreq_child(st - struct tcp_sock *newtp; - - /* Now setup tcp_sock */ -+ SET_VE_OWNER_SK(newsk, VE_OWNER_SK(sk)); -+ - newtp = tcp_sk(newsk); - newtp->pred_flags = 0; - newtp->rcv_nxt = treq->rcv_isn + 1; -diff -upr linux-2.6.16.orig/net/ipv4/tcp_output.c linux-2.6.16-026test009/net/ipv4/tcp_output.c ---- linux-2.6.16.orig/net/ipv4/tcp_output.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/tcp_output.c 2006-04-19 15:02:12.000000000 +0400 -@@ -42,6 +42,9 @@ - #include <linux/module.h> - #include <linux/smp_lock.h> - -+#include <ub/ub_net.h> -+#include <ub/ub_tcp.h> -+ - /* People can turn this off for buggy TCP's found in printers etc. */ - int sysctl_tcp_retrans_collapse = 1; - -@@ -528,15 +531,23 @@ int tcp_fragment(struct sock *sk, struct - if (nsize < 0) - nsize = 0; - -- if (skb_cloned(skb) && -- skb_is_nonlinear(skb) && -- pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) -- return -ENOMEM; -+ if (skb_cloned(skb) && skb_is_nonlinear(skb)) { -+ unsigned long chargesize; -+ chargesize = skb_bc(skb)->charged; -+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) -+ return -ENOMEM; -+ ub_sock_retwres_tcp(sk, chargesize, chargesize); -+ ub_tcpsndbuf_charge_forced(sk, skb); -+ } - - /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); - if (buff == NULL) - return -ENOMEM; /* We'll just try again later. */ -+ if (ub_tcpsndbuf_charge(sk, buff) < 0) { -+ kfree_skb(buff); -+ return -ENOMEM; -+ } - sk_charge_skb(sk, buff); - - /* Correct the sequence numbers. 
*/ -@@ -978,6 +989,11 @@ static int tso_fragment(struct sock *sk, - if (unlikely(buff == NULL)) - return -ENOMEM; - -+ if (ub_tcpsndbuf_charge(sk, buff) < 0) { -+ kfree_skb(buff); -+ return -ENOMEM; -+ } -+ - buff->truesize = nlen; - skb->truesize -= nlen; - -@@ -1281,7 +1297,7 @@ u32 __tcp_select_window(struct sock *sk) - if (free_space < full_space/2) { - icsk->icsk_ack.quick = 0; - -- if (tcp_memory_pressure) -+ if (ub_tcp_shrink_rcvbuf(sk)) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); - - if (free_space < mss) -@@ -1708,6 +1724,7 @@ void tcp_send_fin(struct sock *sk) - break; - yield(); - } -+ ub_tcpsndbuf_charge_forced(sk, skb); - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_TCP_HEADER); -@@ -1777,6 +1794,10 @@ int tcp_send_synack(struct sock *sk) - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); - if (nskb == NULL) - return -ENOMEM; -+ if (ub_tcpsndbuf_charge(sk, skb) < 0) { -+ kfree_skb(nskb); -+ return -ENOMEM; -+ } - __skb_unlink(skb, &sk->sk_write_queue); - skb_header_release(nskb); - __skb_queue_head(&sk->sk_write_queue, nskb); -@@ -1928,6 +1949,10 @@ int tcp_connect(struct sock *sk) - buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); - if (unlikely(buff == NULL)) - return -ENOBUFS; -+ if (ub_tcpsndbuf_charge(sk, buff) < 0) { -+ kfree_skb(buff); -+ return -ENOBUFS; -+ } - - /* Reserve space for headers. 
*/ - skb_reserve(buff, MAX_TCP_HEADER); -diff -upr linux-2.6.16.orig/net/ipv4/tcp_timer.c linux-2.6.16-026test009/net/ipv4/tcp_timer.c ---- linux-2.6.16.orig/net/ipv4/tcp_timer.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/tcp_timer.c 2006-04-19 15:02:12.000000000 +0400 -@@ -22,6 +22,8 @@ - - #include <linux/module.h> - #include <net/tcp.h> -+#include <ub/ub_orphan.h> -+#include <ub/ub_tcp.h> - - int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; - int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; -@@ -67,7 +69,7 @@ static void tcp_write_err(struct sock *s - static int tcp_out_of_resources(struct sock *sk, int do_reset) - { - struct tcp_sock *tp = tcp_sk(sk); -- int orphans = atomic_read(&tcp_orphan_count); -+ int orphans = ub_get_orphan_count(sk); - - /* If peer does not open window for long time, or did not transmit - * anything for long time, penalize it. */ -@@ -78,9 +80,7 @@ static int tcp_out_of_resources(struct s - if (sk->sk_err_soft) - orphans <<= 1; - -- if (orphans >= sysctl_tcp_max_orphans || -- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && -- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { -+ if (ub_too_many_orphans(sk, orphans)) { - if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); - -@@ -173,9 +173,12 @@ static int tcp_write_timeout(struct sock - static void tcp_delack_timer(unsigned long data) - { - struct sock *sk = (struct sock*)data; -+ struct ve_struct *env; - struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - -+ env = set_exec_env(VE_OWNER_SK(sk)); -+ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. 
*/ -@@ -224,11 +227,12 @@ static void tcp_delack_timer(unsigned lo - TCP_CHECK_TIMER(sk); - - out: -- if (tcp_memory_pressure) -+ if (ub_tcp_memory_pressure(sk)) - sk_stream_mem_reclaim(sk); - out_unlock: - bh_unlock_sock(sk); - sock_put(sk); -+ (void)set_exec_env(env); - } - - static void tcp_probe_timer(struct sock *sk) -@@ -283,8 +287,11 @@ static void tcp_probe_timer(struct sock - static void tcp_retransmit_timer(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); -+ struct ve_struct *env; - struct inet_connection_sock *icsk = inet_csk(sk); - -+ env = set_exec_env(VE_OWNER_SK(sk)); -+ - if (!tp->packets_out) - goto out; - -@@ -381,15 +388,19 @@ out_reset_timer: - if (icsk->icsk_retransmits > sysctl_tcp_retries1) - __sk_dst_reset(sk); - --out:; -+out: -+ (void)set_exec_env(env); - } - - static void tcp_write_timer(unsigned long data) - { - struct sock *sk = (struct sock*)data; -+ struct ve_struct *env; - struct inet_connection_sock *icsk = inet_csk(sk); - int event; - -+ env = set_exec_env(VE_OWNER_SK(sk)); -+ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later */ -@@ -423,6 +434,7 @@ out: - out_unlock: - bh_unlock_sock(sk); - sock_put(sk); -+ (void)set_exec_env(env); - } - - /* -@@ -450,10 +462,13 @@ void tcp_set_keepalive(struct sock *sk, - static void tcp_keepalive_timer (unsigned long data) - { - struct sock *sk = (struct sock *) data; -+ struct ve_struct *env; - struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - __u32 elapsed; - -+ env = set_exec_env(VE_OWNER_SK(sk)); -+ - /* Only process if socket is not in use. 
*/ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { -@@ -525,4 +540,5 @@ death: - out: - bh_unlock_sock(sk); - sock_put(sk); -+ (void)set_exec_env(env); - } -diff -upr linux-2.6.16.orig/net/ipv4/udp.c linux-2.6.16-026test009/net/ipv4/udp.c ---- linux-2.6.16.orig/net/ipv4/udp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv4/udp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -127,7 +127,9 @@ static int udp_v4_get_port(struct sock * - struct hlist_node *node; - struct sock *sk2; - struct inet_sock *inet = inet_sk(sk); -+ struct ve_struct *env; - -+ env = VE_OWNER_SK(sk); - write_lock_bh(&udp_hash_lock); - if (snum == 0) { - int best_size_so_far, best, result, i; -@@ -141,7 +143,7 @@ static int udp_v4_get_port(struct sock * - struct hlist_head *list; - int size; - -- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; -+ list = &udp_hash[udp_hashfn(result, VEID(env))]; - if (hlist_empty(list)) { - if (result > sysctl_local_port_range[1]) - result = sysctl_local_port_range[0] + -@@ -163,7 +165,7 @@ static int udp_v4_get_port(struct sock * - result = sysctl_local_port_range[0] - + ((result - sysctl_local_port_range[0]) & - (UDP_HTABLE_SIZE - 1)); -- if (!udp_lport_inuse(result)) -+ if (!udp_lport_inuse(result, env)) - break; - } - if (i >= (1 << 16) / UDP_HTABLE_SIZE) -@@ -172,11 +174,12 @@ gotit: - udp_port_rover = snum = result; - } else { - sk_for_each(sk2, node, -- &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { -+ &udp_hash[udp_hashfn(snum, VEID(env))]) { - struct inet_sock *inet2 = inet_sk(sk2); - - if (inet2->num == snum && - sk2 != sk && -+ ve_accessible_strict(VE_OWNER_SK(sk2), env) && - !ipv6_only_sock(sk2) && - (!sk2->sk_bound_dev_if || - !sk->sk_bound_dev_if || -@@ -190,7 +193,7 @@ gotit: - } - inet->num = snum; - if (sk_unhashed(sk)) { -- struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; -+ struct hlist_head *h = &udp_hash[udp_hashfn(snum, VEID(env))]; - - sk_add_node(sk, h); - sock_prot_inc_use(sk->sk_prot); -@@ -228,11 
+231,15 @@ static struct sock *udp_v4_lookup_longwa - struct hlist_node *node; - unsigned short hnum = ntohs(dport); - int badness = -1; -+ struct ve_struct *env; - -- sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { -+ env = get_exec_env(); -+ sk_for_each(sk, node, &udp_hash[udp_hashfn(hnum, VEID(env))]) { - struct inet_sock *inet = inet_sk(sk); - -- if (inet->num == hnum && !ipv6_only_sock(sk)) { -+ if (inet->num == hnum && -+ ve_accessible_strict(VE_OWNER_SK(sk), env) && -+ !ipv6_only_sock(sk)) { - int score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) -@@ -1049,7 +1056,8 @@ static int udp_v4_mcast_deliver(struct s - int dif; - - read_lock(&udp_hash_lock); -- sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); -+ sk = sk_head(&udp_hash[udp_hashfn(ntohs(uh->dest), -+ VEID(VE_OWNER_SKB(skb)))]); - dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { -@@ -1367,10 +1375,14 @@ static struct sock *udp_get_first(struct - { - struct sock *sk; - struct udp_iter_state *state = seq->private; -+ struct ve_struct *env; - -+ env = get_exec_env(); - for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { - struct hlist_node *node; - sk_for_each(sk, node, &udp_hash[state->bucket]) { -+ if (!ve_accessible(VE_OWNER_SK(sk), env)) -+ continue; - if (sk->sk_family == state->family) - goto found; - } -@@ -1387,8 +1399,13 @@ static struct sock *udp_get_next(struct - do { - sk = sk_next(sk); - try_again: -- ; -- } while (sk && sk->sk_family != state->family); -+ if (!sk) -+ break; -+ if (sk->sk_family != state->family) -+ continue; -+ if (ve_accessible(VE_OWNER_SK(sk), get_exec_env())) -+ break; -+ } while (1); - - if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { - sk = sk_head(&udp_hash[state->bucket]); -@@ -1474,7 +1491,12 @@ int udp_proc_register(struct udp_seq_afi - afinfo->seq_fops->llseek = seq_lseek; - afinfo->seq_fops->release 
= seq_release_private; - -- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); -+ if (*(afinfo->name) == 'n') -+ p = proc_glob_fops_create(afinfo->name, S_IRUGO, -+ afinfo->seq_fops); -+ else -+ p = proc_net_fops_create(afinfo->name, S_IRUGO, -+ afinfo->seq_fops); - if (p) - p->data = afinfo; - else -@@ -1486,7 +1508,10 @@ void udp_proc_unregister(struct udp_seq_ - { - if (!afinfo) - return; -- proc_net_remove(afinfo->name); -+ if (*(afinfo->name) == 'n') -+ remove_proc_glob_entry(afinfo->name, NULL); -+ else -+ proc_net_remove(afinfo->name); - memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); - } - -@@ -1529,7 +1554,7 @@ static int udp4_seq_show(struct seq_file - static struct file_operations udp4_seq_fops; - static struct udp_seq_afinfo udp4_seq_afinfo = { - .owner = THIS_MODULE, -- .name = "udp", -+ .name = "net/udp", - .family = AF_INET, - .seq_show = udp4_seq_show, - .seq_fops = &udp4_seq_fops, -diff -upr linux-2.6.16.orig/net/ipv6/addrconf.c linux-2.6.16-026test009/net/ipv6/addrconf.c ---- linux-2.6.16.orig/net/ipv6/addrconf.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv6/addrconf.c 2006-04-19 15:02:12.000000000 +0400 -@@ -2153,6 +2153,10 @@ static int addrconf_notify(struct notifi - struct inet6_dev *idev = __in6_dev_get(dev); - int run_pending = 0; - -+ /* not virtualized yet */ -+ if (!ve_is_super(get_exec_env())) -+ return NOTIFY_OK; -+ - switch(event) { - case NETDEV_UP: - case NETDEV_CHANGE: -diff -upr linux-2.6.16.orig/net/ipv6/inet6_hashtables.c linux-2.6.16-026test009/net/ipv6/inet6_hashtables.c ---- linux-2.6.16.orig/net/ipv6/inet6_hashtables.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv6/inet6_hashtables.c 2006-04-19 15:02:12.000000000 +0400 -@@ -33,7 +33,7 @@ struct sock *inet6_lookup_listener(struc - int score, hiscore = 0; - - read_lock(&hashinfo->lhash_lock); -- sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { -+ sk_for_each(sk, node, 
&hashinfo->listening_hash[inet_lhashfn(hnum, 0)]) { - if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { - const struct ipv6_pinfo *np = inet6_sk(sk); - -@@ -173,7 +173,9 @@ int inet6_hash_connect(struct inet_timew - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; -+ struct ve_struct *ve; - -+ ve = VE_OWNER_SK(sk); - if (snum == 0) { - const int low = sysctl_local_port_range[0]; - const int high = sysctl_local_port_range[1]; -@@ -187,7 +189,8 @@ int inet6_hash_connect(struct inet_timew - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; -- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; -+ head = &hinfo->bhash[inet_bhashfn(port, -+ hinfo->bhash_size, VEID(ve))]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, -@@ -208,7 +211,7 @@ int inet6_hash_connect(struct inet_timew - } - - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, -- head, port); -+ head, port, ve); - if (!tb) { - spin_unlock(&head->lock); - break; -@@ -243,7 +246,7 @@ ok: - goto out; - } - -- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; -+ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - -diff -upr linux-2.6.16.orig/net/ipv6/route.c linux-2.6.16-026test009/net/ipv6/route.c ---- linux-2.6.16.orig/net/ipv6/route.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv6/route.c 2006-04-19 15:02:12.000000000 +0400 -@@ -113,7 +113,6 @@ struct rt6_info ip6_null_entry = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, -- .dev = &loopback_dev, - .obsolete = -1, - .error = -ENETUNREACH, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, -@@ -2121,6 +2120,7 @@ void __init ip6_route_init(void) - #ifdef CONFIG_XFRM - xfrm6_init(); - #endif -+ ip6_null_entry.u.dst.dev = &loopback_dev; - } - - void ip6_route_cleanup(void) -diff -upr linux-2.6.16.orig/net/ipv6/udp.c 
linux-2.6.16-026test009/net/ipv6/udp.c ---- linux-2.6.16.orig/net/ipv6/udp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/ipv6/udp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -69,7 +69,9 @@ static int udp_v6_get_port(struct sock * - { - struct sock *sk2; - struct hlist_node *node; -+ struct ve_struct *env; - -+ env = VE_OWNER_SK(sk); - write_lock_bh(&udp_hash_lock); - if (snum == 0) { - int best_size_so_far, best, result, i; -@@ -83,7 +85,7 @@ static int udp_v6_get_port(struct sock * - int size; - struct hlist_head *list; - -- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; -+ list = &udp_hash[udp_hashfn(result, VEID(env))]; - if (hlist_empty(list)) { - if (result > sysctl_local_port_range[1]) - result = sysctl_local_port_range[0] + -@@ -105,7 +107,7 @@ static int udp_v6_get_port(struct sock * - result = sysctl_local_port_range[0] - + ((result - sysctl_local_port_range[0]) & - (UDP_HTABLE_SIZE - 1)); -- if (!udp_lport_inuse(result)) -+ if (!udp_lport_inuse(result, env)) - break; - } - if (i >= (1 << 16) / UDP_HTABLE_SIZE) -@@ -114,9 +116,10 @@ gotit: - udp_port_rover = snum = result; - } else { - sk_for_each(sk2, node, -- &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { -+ &udp_hash[udp_hashfn(snum, VEID(env))]) { - if (inet_sk(sk2)->num == snum && - sk2 != sk && -+ ve_accessible_strict(VE_OWNER_SK(sk2), env) && - (!sk2->sk_bound_dev_if || - !sk->sk_bound_dev_if || - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && -@@ -128,7 +131,7 @@ gotit: - - inet_sk(sk)->num = snum; - if (sk_unhashed(sk)) { -- sk_add_node(sk, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]); -+ sk_add_node(sk, &udp_hash[udp_hashfn(snum, VEID(env))]); - sock_prot_inc_use(sk->sk_prot); - } - write_unlock_bh(&udp_hash_lock); -diff -upr linux-2.6.16.orig/net/netfilter/core.c linux-2.6.16-026test009/net/netfilter/core.c ---- linux-2.6.16.orig/net/netfilter/core.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/core.c 2006-04-19 15:02:12.000000000 +0400 -@@ 
-32,16 +32,24 @@ - * of skbuffs queued for userspace, and not deregister a hook unless - * this is zero, but that sucks. Now, we simply check when the - * packets come back: if the hook is gone, the packet is discarded. */ -+static DEFINE_SPINLOCK(nf_hook_lock); -+ - struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; - EXPORT_SYMBOL(nf_hooks); --static DEFINE_SPINLOCK(nf_hook_lock); -+#ifdef CONFIG_VE_IPTABLES -+#define ve_nf_hooks \ -+ ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks)) -+#else -+#define ve_nf_hooks nf_hooks -+#endif -+ - - int nf_register_hook(struct nf_hook_ops *reg) - { - struct list_head *i; - - spin_lock_bh(&nf_hook_lock); -- list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { -+ list_for_each(i, &ve_nf_hooks[reg->pf][reg->hooknum]) { - if (reg->priority < ((struct nf_hook_ops *)i)->priority) - break; - } -@@ -53,6 +61,33 @@ int nf_register_hook(struct nf_hook_ops - } - EXPORT_SYMBOL(nf_register_hook); - -+int virt_nf_register_hook(struct nf_hook_ops *reg) -+{ -+ int ret = 0; -+ -+ if (!ve_is_super(get_exec_env())) { -+ struct nf_hook_ops *tmp; -+ ret = -ENOMEM; -+ tmp = kmalloc(sizeof(struct nf_hook_ops), GFP_KERNEL); -+ if (!tmp) -+ goto nomem; -+ memcpy(tmp, reg, sizeof(struct nf_hook_ops)); -+ reg = tmp; -+ } -+ -+ ret = nf_register_hook(reg); -+ if (ret) -+ goto out; -+ -+ return 0; -+out: -+ if (!ve_is_super(get_exec_env())) -+ kfree(reg); -+nomem: -+ return ret; -+} -+EXPORT_SYMBOL(virt_nf_register_hook); -+ - void nf_unregister_hook(struct nf_hook_ops *reg) - { - spin_lock_bh(&nf_hook_lock); -@@ -63,6 +98,29 @@ void nf_unregister_hook(struct nf_hook_o - } - EXPORT_SYMBOL(nf_unregister_hook); - -+int virt_nf_unregister_hook(struct nf_hook_ops *reg) -+{ -+ struct nf_hook_ops *i; -+ -+ spin_lock_bh(&nf_hook_lock); -+ list_for_each_entry(i, &ve_nf_hooks[reg->pf][reg->hooknum], list) { -+ if (reg->hook == i->hook) { -+ reg = i; -+ break; -+ } -+ } -+ spin_unlock_bh(&nf_hook_lock); -+ if (reg != i) -+ return -ENOENT; -+ -+ 
nf_unregister_hook(reg); -+ -+ if (!ve_is_super(get_exec_env())) -+ kfree(reg); -+ return 0; -+} -+EXPORT_SYMBOL(virt_nf_unregister_hook); -+ - unsigned int nf_iterate(struct list_head *head, - struct sk_buff **skb, - int hook, -@@ -120,9 +178,9 @@ int nf_hook_slow(int pf, unsigned int ho - /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); - -- elem = &nf_hooks[pf][hook]; -+ elem = &ve_nf_hooks[pf][hook]; - next_hook: -- verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, -+ verdict = nf_iterate(&ve_nf_hooks[pf][hook], pskb, hook, indev, - outdev, &elem, okfn, hook_thresh); - if (verdict == NF_ACCEPT || verdict == NF_STOP) { - ret = 1; -@@ -195,13 +253,54 @@ struct proc_dir_entry *proc_net_netfilte - EXPORT_SYMBOL(proc_net_netfilter); - #endif - --void __init netfilter_init(void) -+void init_nf_hooks(struct list_head (*nh)[NF_MAX_HOOKS]) - { - int i, h; - for (i = 0; i < NPROTO; i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) -- INIT_LIST_HEAD(&nf_hooks[i][h]); -+ INIT_LIST_HEAD(&ve_nf_hooks[i][h]); - } -+} -+ -+int init_netfilter(void) -+{ -+#ifdef CONFIG_VE_IPTABLES -+ struct ve_struct *envid; -+ -+ envid = get_exec_env(); -+ envid->_nf_hooks = kmalloc(sizeof(nf_hooks), GFP_KERNEL); -+ if (envid->_nf_hooks == NULL) -+ return -ENOMEM; -+ -+ /* FIXME: charge ubc */ -+ -+ init_nf_hooks(envid->_nf_hooks); -+ return 0; -+#else -+ init_nf_hooks(nf_hooks); -+ return 0; -+#endif -+} -+EXPORT_SYMBOL(init_netfilter); -+ -+#ifdef CONFIG_VE_IPTABLES -+void fini_netfilter(void) -+{ -+ struct ve_struct *envid; -+ -+ envid = get_exec_env(); -+ if (envid->_nf_hooks != NULL) -+ kfree(envid->_nf_hooks); -+ envid->_nf_hooks = NULL; -+ -+ /* FIXME: uncharge ubc */ -+} -+EXPORT_SYMBOL(fini_netfilter); -+#endif -+ -+void __init netfilter_init(void) -+{ -+ init_netfilter(); - - #ifdef CONFIG_PROC_FS - proc_net_netfilter = proc_mkdir("netfilter", proc_net); -@@ -214,3 +313,4 @@ void __init netfilter_init(void) - if (netfilter_log_init() < 0) - 
panic("cannot initialize nf_log"); - } -+ -diff -upr linux-2.6.16.orig/net/netfilter/nf_conntrack_netlink.c linux-2.6.16-026test009/net/netfilter/nf_conntrack_netlink.c ---- linux-2.6.16.orig/net/netfilter/nf_conntrack_netlink.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/nf_conntrack_netlink.c 2006-04-19 15:02:11.000000000 +0400 -@@ -1641,7 +1641,7 @@ static void __exit ctnetlink_exit(void) - printk("ctnetlink: unregistering from nfnetlink.\n"); - - #ifdef CONFIG_NF_CONNTRACK_EVENTS -- nf_conntrack_unregister_notifier(&ctnl_notifier_exp); -+ nf_conntrack_expect_unregister_notifier(&ctnl_notifier_exp); - nf_conntrack_unregister_notifier(&ctnl_notifier); - #endif - -diff -upr linux-2.6.16.orig/net/netfilter/nf_queue.c linux-2.6.16-026test009/net/netfilter/nf_queue.c ---- linux-2.6.16.orig/net/netfilter/nf_queue.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/nf_queue.c 2006-04-19 15:02:12.000000000 +0400 -@@ -209,12 +209,12 @@ void nf_reinject(struct sk_buff *skb, st - /* Drop reference to owner of hook which queued us. */ - module_put(info->elem->owner); - -- list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { -+ list_for_each_rcu(i, &ve_nf_hooks[info->pf][info->hook]) { - if (i == elem) - break; - } - -- if (i == &nf_hooks[info->pf][info->hook]) { -+ if (i == &ve_nf_hooks[info->pf][info->hook]) { - /* The module which sent it to userspace is gone. 
*/ - NFDEBUG("%s: module disappeared, dropping packet.\n", - __FUNCTION__); -@@ -235,7 +235,7 @@ void nf_reinject(struct sk_buff *skb, st - - if (verdict == NF_ACCEPT) { - next_hook: -- verdict = nf_iterate(&nf_hooks[info->pf][info->hook], -+ verdict = nf_iterate(&ve_nf_hooks[info->pf][info->hook], - &skb, info->hook, - info->indev, info->outdev, &elem, - info->okfn, INT_MIN); -diff -upr linux-2.6.16.orig/net/netfilter/nf_sockopt.c linux-2.6.16-026test009/net/netfilter/nf_sockopt.c ---- linux-2.6.16.orig/net/netfilter/nf_sockopt.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/nf_sockopt.c 2006-04-19 15:02:12.000000000 +0400 -@@ -80,6 +80,12 @@ static int nf_sockopt(struct sock *sk, i - struct nf_sockopt_ops *ops; - int ret; - -+#ifdef CONFIG_VE_IPTABLES -+ if (!get_exec_env()->_nf_hooks || -+ !get_exec_env()->_ipt_standard_target) -+ return -ENOPROTOOPT; -+#endif -+ - if (down_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; - -diff -upr linux-2.6.16.orig/net/netfilter/x_tables.c linux-2.6.16-026test009/net/netfilter/x_tables.c ---- linux-2.6.16.orig/net/netfilter/x_tables.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/x_tables.c 2006-04-19 15:02:12.000000000 +0400 -@@ -24,6 +24,10 @@ - - #include <linux/netfilter/x_tables.h> - #include <linux/netfilter_arp.h> -+#include <linux/nfcalls.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_mem.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -@@ -38,7 +42,13 @@ struct xt_af { - struct list_head tables; - }; - -+#ifdef CONFIG_VE_IPTABLES -+/* include ve.h and define get_exec_env */ -+#include <linux/sched.h> -+#define xt (get_exec_env()->_xt) -+#else - static struct xt_af *xt; -+#endif - - #ifdef DEBUG_IP_FIREWALL_USER - #define duprintf(format, args...) 
printk(format , ## args) -@@ -52,17 +62,52 @@ enum { - MATCH, - }; - -+#ifdef CONFIG_USER_RESOURCE -+#define UB_NUMXTENT 23 -+static int charge_xtables(struct user_beancounter *ub, unsigned long size) -+{ -+ if (ub == NULL) -+ return 0; -+ return charge_beancounter(ub, UB_NUMXTENT, size, 1); -+} -+static void uncharge_xtables(struct user_beancounter *ub, unsigned long size) -+{ -+ if (ub == NULL) -+ return; -+ uncharge_beancounter(ub, UB_NUMXTENT, size); -+} -+#endif /* CONFIG_USER_RESOURCE */ -+ - /* Registration hooks for targets. */ - int - xt_register_target(int af, struct xt_target *target) - { - int ret; -+ struct module *mod = target->me; -+ -+ if (!ve_is_super(get_exec_env())) { -+ struct xt_target *tmp; -+ __module_get(mod); -+ ret = -ENOMEM; -+ tmp = ub_kmalloc(sizeof(struct xt_target), GFP_KERNEL); -+ if (!tmp) -+ goto nomem; -+ memcpy(tmp, target, sizeof(struct xt_target)); -+ target = tmp; -+ } - - ret = down_interruptible(&xt[af].mutex); - if (ret != 0) -- return ret; -+ goto out; - list_add(&target->list, &xt[af].target); - up(&xt[af].mutex); -+ return 0; -+out: -+ if (!ve_is_super(get_exec_env())) { -+ kfree(target); -+nomem: -+ module_put(mod); -+ } - return ret; - } - EXPORT_SYMBOL(xt_register_target); -@@ -71,8 +116,21 @@ void - xt_unregister_target(int af, struct xt_target *target) - { - down(&xt[af].mutex); -+ if (!ve_is_super(get_exec_env())) { -+ target = list_named_find(&xt[af].target, target->name); -+ if (!target) { -+ up(&xt[af].mutex); -+ return; -+ } -+ } -+ - LIST_DELETE(&xt[af].target, target); - up(&xt[af].mutex); -+ -+ if (!ve_is_super(get_exec_env())) { -+ module_put(target->me); -+ kfree(target); -+ } - } - EXPORT_SYMBOL(xt_unregister_target); - -@@ -80,14 +138,33 @@ int - xt_register_match(int af, struct xt_match *match) - { - int ret; -+ struct module *mod = match->me; -+ -+ if (!ve_is_super(get_exec_env())) { -+ struct xt_match *tmp; -+ __module_get(mod); -+ ret = -ENOMEM; -+ tmp = ub_kmalloc(sizeof(struct xt_match), 
GFP_KERNEL); -+ if (!tmp) -+ goto nomem; -+ memcpy(tmp, match, sizeof(struct xt_match)); -+ match = tmp; -+ } - - ret = down_interruptible(&xt[af].mutex); - if (ret != 0) -- return ret; -+ goto out; - - list_add(&match->list, &xt[af].match); - up(&xt[af].mutex); - -+ return 0; -+out: -+ if (!ve_is_super(get_exec_env())) { -+ kfree(match); -+nomem: -+ module_put(mod); -+ } - return ret; - } - EXPORT_SYMBOL(xt_register_match); -@@ -96,8 +173,21 @@ void - xt_unregister_match(int af, struct xt_match *match) - { - down(&xt[af].mutex); -+ if (!ve_is_super(get_exec_env())) { -+ match = list_named_find(&xt[af].match, match->name); -+ if (!match) { -+ up(&xt[af].mutex); -+ return; -+ } -+ } -+ - LIST_DELETE(&xt[af].match, match); - up(&xt[af].mutex); -+ -+ if (!ve_is_super(get_exec_env())) { -+ module_put(match->me); -+ kfree(match); -+ } - } - EXPORT_SYMBOL(xt_unregister_match); - -@@ -246,7 +336,7 @@ struct xt_table_info *xt_alloc_table_inf - if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) - return NULL; - -- newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL); -+ newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL_UBC); - if (!newinfo) - return NULL; - -@@ -255,10 +345,10 @@ struct xt_table_info *xt_alloc_table_inf - for_each_cpu(cpu) { - if (size <= PAGE_SIZE) - newinfo->entries[cpu] = kmalloc_node(size, -- GFP_KERNEL, -+ GFP_KERNEL_UBC, - cpu_to_node(cpu)); - else -- newinfo->entries[cpu] = vmalloc_node(size, -+ newinfo->entries[cpu] = ub_vmalloc_node(size, - cpu_to_node(cpu)); - - if (newinfo->entries[cpu] == NULL) { -@@ -315,6 +405,9 @@ xt_replace_table(struct xt_table *table, - int *error) - { - struct xt_table_info *oldinfo, *private; -+#ifdef CONFIG_USER_RESOURCE -+ struct user_beancounter *old_ub, *new_ub; -+#endif - - /* Do the substitution. 
*/ - write_lock_bh(&table->lock); -@@ -328,6 +421,21 @@ xt_replace_table(struct xt_table *table, - return NULL; - } - oldinfo = private; -+ -+#ifdef CONFIG_USER_RESOURCE -+ new_ub = mem_ub(newinfo); -+ if (charge_xtables(new_ub, newinfo->number)) { -+ oldinfo = NULL; -+ write_unlock_bh(&table->lock); -+ *error = -ENOMEM; -+ return NULL; -+ } -+ if (num_counters) { -+ old_ub = mem_ub(oldinfo); -+ uncharge_xtables(old_ub, oldinfo->number); -+ } -+#endif -+ - table->private = newinfo; - newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); -@@ -355,6 +463,7 @@ int xt_register_table(struct xt_table *t - - /* Simplifies replace_table code. */ - table->private = bootstrap; -+ rwlock_init(&table->lock); - if (!xt_replace_table(table, 0, newinfo, &ret)) - goto unlock; - -@@ -364,7 +473,6 @@ int xt_register_table(struct xt_table *t - /* save number of initial entries */ - private->initial_entries = private->number; - -- rwlock_init(&table->lock); - list_prepend(&xt[table->af].tables, table); - - ret = 0; -@@ -374,6 +482,39 @@ int xt_register_table(struct xt_table *t - } - EXPORT_SYMBOL_GPL(xt_register_table); - -+struct xt_table * virt_xt_register_table(struct xt_table *table, -+ struct xt_table_info *bootstrap, -+ struct xt_table_info *newinfo) -+{ -+ int ret; -+ struct module *mod = table->me; -+ -+ if (!ve_is_super(get_exec_env())) { -+ struct xt_table *tmp; -+ __module_get(mod); -+ ret = -ENOMEM; -+ tmp = ub_kmalloc(sizeof(struct xt_table), GFP_KERNEL); -+ if (!tmp) -+ goto nomem; -+ memcpy(tmp, table, sizeof(struct xt_table)); -+ table = tmp; -+ } -+ -+ ret = xt_register_table(table, bootstrap, newinfo); -+ if (ret) -+ goto out; -+ -+ return table; -+out: -+ if (!ve_is_super(get_exec_env())) { -+ kfree(table); -+nomem: -+ module_put(mod); -+ } -+ return ERR_PTR(ret); -+} -+EXPORT_SYMBOL_GPL(virt_xt_register_table); -+ - void *xt_unregister_table(struct xt_table *table) - { - struct xt_table_info *private; -@@ -383,10 +524,27 @@ void 
*xt_unregister_table(struct xt_tabl - LIST_DELETE(&xt[table->af].tables, table); - up(&xt[table->af].mutex); - -+#ifdef CONFIG_USER_RESOURCE -+ uncharge_xtables(mem_ub(private), private->number); -+#endif -+ - return private; - } - EXPORT_SYMBOL_GPL(xt_unregister_table); - -+void *virt_xt_unregister_table(struct xt_table *table) -+{ -+ void *ret; -+ -+ ret = xt_unregister_table(table); -+ if (!ve_is_super(get_exec_env())) { -+ module_put(table->me); -+ kfree(table); -+ } -+ return ret; -+} -+EXPORT_SYMBOL_GPL(virt_xt_unregister_table); -+ - #ifdef CONFIG_PROC_FS - static char *xt_proto_prefix[NPROTO] = { - [AF_INET] = "ip", -@@ -597,10 +755,13 @@ void xt_proto_fini(int af) - EXPORT_SYMBOL_GPL(xt_proto_fini); - - --static int __init xt_init(void) -+int init_xtables(void) - { - int i; - -+ if (xt) -+ return -EEXIST; -+ - xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL); - if (!xt) - return -ENOMEM; -@@ -614,11 +775,34 @@ static int __init xt_init(void) - return 0; - } - --static void __exit xt_fini(void) -+void fini_xtables(void) - { - kfree(xt); -+ xt = NULL; -+} -+ -+static int __init xt_init(void) -+{ -+ int err; -+ -+ err = init_xtables(); -+ if (err) -+ return err; -+ -+ KSYMRESOLVE(init_xtables); -+ KSYMRESOLVE(fini_xtables); -+ KSYMMODRESOLVE(x_tables); -+ return 0; -+} -+ -+static void __exit xt_fini(void) -+{ -+ KSYMMODUNRESOLVE(x_tables); -+ KSYMUNRESOLVE(init_xtables); -+ KSYMUNRESOLVE(fini_xtables); -+ fini_xtables(); - } - --module_init(xt_init); -+subsys_initcall(xt_init); - module_exit(xt_fini); - -diff -upr linux-2.6.16.orig/net/netfilter/xt_conntrack.c linux-2.6.16-026test009/net/netfilter/xt_conntrack.c ---- linux-2.6.16.orig/net/netfilter/xt_conntrack.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/xt_conntrack.c 2006-04-19 15:02:12.000000000 +0400 -@@ -20,6 +20,8 @@ - - #include <linux/netfilter/x_tables.h> - #include <linux/netfilter/xt_conntrack.h> -+#include <linux/netfilter_ipv4/ip_tables.h> -+#include 
<linux/nfcalls.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -@@ -213,25 +215,145 @@ static int check(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat_to_user(void *match, void **dstptr, -+ int *size, int off) -+{ -+ struct ipt_entry_match *pm; -+ struct xt_conntrack_info *pinfo; -+ struct compat_xt_conntrack_info info; -+ u_int16_t msize; -+ -+ pm = (struct ipt_entry_match *)match; -+ msize = pm->u.user.match_size; -+ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match))) -+ return -EFAULT; -+ pinfo = (struct xt_conntrack_info *)pm->data; -+ memset(&info, 0, sizeof(struct compat_xt_conntrack_info)); -+ info.statemask = pinfo->statemask; -+ info.statusmask = pinfo->statusmask; -+ memcpy(info.tuple, pinfo->tuple, IP_CT_DIR_MAX * -+ sizeof(struct ip_conntrack_tuple)); -+ memcpy(info.sipmsk, pinfo->sipmsk, -+ IP_CT_DIR_MAX * sizeof(struct in_addr)); -+ memcpy(info.dipmsk, pinfo->dipmsk, -+ IP_CT_DIR_MAX * sizeof(struct in_addr)); -+ info.expires_min = pinfo->expires_min; -+ info.expires_max = pinfo->expires_max; -+ info.flags = pinfo->flags; -+ info.invflags = pinfo->invflags; -+ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match), -+ &info, sizeof(struct compat_xt_conntrack_info))) -+ return -EFAULT; -+ msize -= off; -+ if (put_user(msize, (u_int16_t *)*dstptr)) -+ return -EFAULT; -+ *size -= off; -+ *dstptr += msize; -+ return 0; -+} -+ -+static int compat_from_user(void *match, void **dstptr, -+ int *size, int off) -+{ -+ struct compat_ipt_entry_match *pm; -+ struct ipt_entry_match *dstpm; -+ struct compat_xt_conntrack_info *pinfo; -+ struct xt_conntrack_info info; -+ u_int16_t msize; -+ -+ pm = (struct compat_ipt_entry_match *)match; -+ dstpm = (struct ipt_entry_match *)*dstptr; -+ msize = pm->u.user.match_size; -+ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match)); -+ pinfo = (struct compat_xt_conntrack_info *)pm->data; -+ memset(&info, 0, sizeof(struct 
xt_conntrack_info)); -+ info.statemask = pinfo->statemask; -+ info.statusmask = pinfo->statusmask; -+ memcpy(info.tuple, pinfo->tuple, IP_CT_DIR_MAX * -+ sizeof(struct ip_conntrack_tuple)); -+ memcpy(info.sipmsk, pinfo->sipmsk, -+ IP_CT_DIR_MAX * sizeof(struct in_addr)); -+ memcpy(info.dipmsk, pinfo->dipmsk, -+ IP_CT_DIR_MAX * sizeof(struct in_addr)); -+ info.expires_min = pinfo->expires_min; -+ info.expires_max = pinfo->expires_max; -+ info.flags = pinfo->flags; -+ info.invflags = pinfo->invflags; -+ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match), -+ &info, sizeof(struct xt_conntrack_info)); -+ msize += off; -+ dstpm->u.user.match_size = msize; -+ *size += off; -+ *dstptr += msize; -+ return 0; -+} -+ -+static int compat(void *match, void **dstptr, int *size, int convert) -+{ -+ int ret, off; -+ -+ off = XT_ALIGN(sizeof(struct xt_conntrack_info)) - -+ COMPAT_XT_ALIGN(sizeof(struct compat_xt_conntrack_info)); -+ switch (convert) { -+ case COMPAT_TO_USER: -+ ret = compat_to_user(match, dstptr, size, off); -+ break; -+ case COMPAT_FROM_USER: -+ ret = compat_from_user(match, dstptr, size, off); -+ break; -+ case COMPAT_CALC_SIZE: -+ *size += off; -+ ret = 0; -+ break; -+ default: -+ ret = -ENOPROTOOPT; -+ break; -+ } -+ return ret; -+} -+#endif -+ - static struct xt_match conntrack_match = { - .name = "conntrack", - .match = &match, - .checkentry = &check, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - -+int init_xt_conntrack_match(void) -+{ -+ return xt_register_match(AF_INET, &conntrack_match); -+} -+ -+void fini_xt_conntrack_match(void) -+{ -+ xt_unregister_match(AF_INET, &conntrack_match); -+} -+ - static int __init init(void) - { - int ret; - need_conntrack(); -- ret = xt_register_match(AF_INET, &conntrack_match); -- -+ ret = init_xt_conntrack_match(); -+ if (ret < 0) -+ return ret; -+ -+ KSYMRESOLVE(init_xt_conntrack_match); -+ KSYMRESOLVE(fini_xt_conntrack_match); -+ KSYMMODRESOLVE(xt_conntrack); - return ret; - } - - 
static void __exit fini(void) - { -- xt_unregister_match(AF_INET, &conntrack_match); -+ KSYMMODUNRESOLVE(xt_conntrack); -+ KSYMUNRESOLVE(init_xt_conntrack_match); -+ KSYMUNRESOLVE(fini_xt_conntrack_match); -+ fini_xt_conntrack_match(); - } - - module_init(init); -diff -upr linux-2.6.16.orig/net/netfilter/xt_helper.c linux-2.6.16-026test009/net/netfilter/xt_helper.c ---- linux-2.6.16.orig/net/netfilter/xt_helper.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/xt_helper.c 2006-04-19 15:02:12.000000000 +0400 -@@ -24,6 +24,8 @@ - #endif - #include <linux/netfilter/x_tables.h> - #include <linux/netfilter/xt_helper.h> -+#include <linux/netfilter_ipv4/ip_tables.h> -+#include <linux/nfcalls.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); -@@ -148,23 +150,107 @@ static int check(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat_to_user(void *match, void **dstptr, -+ int *size, int off) -+{ -+ struct ipt_entry_match *pm; -+ struct xt_helper_info *pinfo; -+ struct compat_xt_helper_info info; -+ u_int16_t msize; -+ -+ pm = (struct ipt_entry_match *)match; -+ msize = pm->u.user.match_size; -+ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match))) -+ return -EFAULT; -+ pinfo = (struct xt_helper_info *)pm->data; -+ memset(&info, 0, sizeof(struct compat_xt_helper_info)); -+ info.invert = pinfo->invert; -+ memcpy(info.name, pinfo->name, 30); -+ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match), -+ &info, sizeof(struct compat_xt_helper_info))) -+ return -EFAULT; -+ msize -= off; -+ if (put_user(msize, (u_int16_t *)*dstptr)) -+ return -EFAULT; -+ *size -= off; -+ *dstptr += msize; -+ return 0; -+} -+ -+static int compat_from_user(void *match, void **dstptr, -+ int *size, int off) -+{ -+ struct compat_ipt_entry_match *pm; -+ struct ipt_entry_match *dstpm; -+ struct compat_xt_helper_info *pinfo; -+ struct xt_helper_info info; -+ u_int16_t msize; -+ -+ pm = 
(struct compat_ipt_entry_match *)match; -+ dstpm = (struct ipt_entry_match *)*dstptr; -+ msize = pm->u.user.match_size; -+ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match)); -+ pinfo = (struct compat_xt_helper_info *)pm->data; -+ memset(&info, 0, sizeof(struct xt_helper_info)); -+ info.invert = pinfo->invert; -+ memcpy(info.name, pinfo->name, 30); -+ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match), -+ &info, sizeof(struct xt_helper_info)); -+ msize += off; -+ dstpm->u.user.match_size = msize; -+ *size += off; -+ *dstptr += msize; -+ return 0; -+} -+ -+static int compat(void *match, void **dstptr, int *size, int convert) -+{ -+ int ret, off; -+ -+ off = XT_ALIGN(sizeof(struct xt_helper_info)) - -+ COMPAT_XT_ALIGN(sizeof(struct compat_xt_helper_info)); -+ switch (convert) { -+ case COMPAT_TO_USER: -+ ret = compat_to_user(match, dstptr, size, off); -+ break; -+ case COMPAT_FROM_USER: -+ ret = compat_from_user(match, dstptr, size, off); -+ break; -+ case COMPAT_CALC_SIZE: -+ *size += off; -+ ret = 0; -+ break; -+ default: -+ ret = -ENOPROTOOPT; -+ break; -+ } -+ return ret; -+} -+#endif -+ - static struct xt_match helper_match = { - .name = "helper", - .match = &match, - .checkentry = &check, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - static struct xt_match helper6_match = { - .name = "helper", - .match = &match, - .checkentry = &check, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_xt_helper(void) - { - int ret; -- need_conntrack(); - - ret = xt_register_match(AF_INET, &helper_match); - if (ret < 0) -@@ -177,12 +263,35 @@ static int __init init(void) - return ret; - } - --static void __exit fini(void) -+void fini_xt_helper(void) - { - xt_unregister_match(AF_INET, &helper_match); - xt_unregister_match(AF_INET6, &helper6_match); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ need_conntrack(); -+ err = init_xt_helper(); -+ if 
(err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_xt_helper); -+ KSYMRESOLVE(fini_xt_helper); -+ KSYMMODRESOLVE(xt_helper); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(xt_helper); -+ KSYMUNRESOLVE(init_xt_helper); -+ KSYMUNRESOLVE(fini_xt_helper); -+ fini_xt_helper(); -+} -+ - module_init(init); - module_exit(fini); - -diff -upr linux-2.6.16.orig/net/netfilter/xt_length.c linux-2.6.16-026test009/net/netfilter/xt_length.c ---- linux-2.6.16.orig/net/netfilter/xt_length.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/xt_length.c 2006-04-19 15:02:12.000000000 +0400 -@@ -13,6 +13,7 @@ - - #include <linux/netfilter/xt_length.h> - #include <linux/netfilter/x_tables.h> -+#include <linux/nfcalls.h> - - MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); - MODULE_DESCRIPTION("IP tables packet length matching module"); -@@ -63,20 +64,38 @@ checkentry(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = XT_ALIGN(sizeof(struct xt_length_info)) - -+ COMPAT_XT_ALIGN(sizeof(struct xt_length_info)); -+ return ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+#endif -+ - static struct xt_match length_match = { - .name = "length", - .match = &match, - .checkentry = &checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - static struct xt_match length6_match = { - .name = "length", - .match = &match6, - .checkentry = &checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_xt_length(void) - { - int ret; - ret = xt_register_match(AF_INET, &length_match); -@@ -89,11 +108,33 @@ static int __init init(void) - return ret; - } - --static void __exit fini(void) -+void fini_xt_length(void) - { - xt_unregister_match(AF_INET, &length_match); - xt_unregister_match(AF_INET6, 
&length6_match); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_xt_length(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_xt_length); -+ KSYMRESOLVE(fini_xt_length); -+ KSYMMODRESOLVE(xt_length); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(xt_length); -+ KSYMUNRESOLVE(init_xt_length); -+ KSYMUNRESOLVE(fini_xt_length); -+ fini_xt_length(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/netfilter/xt_limit.c linux-2.6.16-026test009/net/netfilter/xt_limit.c ---- linux-2.6.16.orig/net/netfilter/xt_limit.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/xt_limit.c 2006-04-19 15:02:12.000000000 +0400 -@@ -17,9 +17,11 @@ - #include <linux/skbuff.h> - #include <linux/spinlock.h> - #include <linux/interrupt.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter/x_tables.h> - #include <linux/netfilter/xt_limit.h> -+#include <linux/netfilter_ipv4/ip_tables.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); -@@ -27,6 +29,13 @@ MODULE_DESCRIPTION("iptables rate limit - MODULE_ALIAS("ipt_limit"); - MODULE_ALIAS("ip6t_limit"); - -+#ifdef CONFIG_VE_IPTABLES -+#include <linux/sched.h> -+#define ve_ipt_limit_reg (*(get_exec_env()->_ipt_limit_reg)) -+#else -+#define ve_ipt_limit_reg ipt_limit_reg -+#endif -+ - /* The algorithm used is the Simple Token Bucket Filter (TBF) - * see net/sched/sch_tbf.c in the linux source tree - */ -@@ -137,20 +146,108 @@ ipt_limit_checkentry(const char *tablena - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int ipt_limit_compat_to_user(void *match, void **dstptr, -+ int *size, int off) -+{ -+ struct ipt_entry_match *pm; -+ struct xt_rateinfo *pinfo; -+ struct compat_xt_rateinfo rinfo; -+ u_int16_t msize; -+ -+ pm = (struct ipt_entry_match *)match; -+ msize = pm->u.user.match_size; -+ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match))) -+ return -EFAULT; -+ pinfo = (struct 
xt_rateinfo *)pm->data; -+ memset(&rinfo, 0, sizeof(struct compat_xt_rateinfo)); -+ rinfo.avg = pinfo->avg; -+ rinfo.burst = pinfo->burst; -+ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match), -+ &rinfo, sizeof(struct compat_xt_rateinfo))) -+ return -EFAULT; -+ msize -= off; -+ if (put_user(msize, (u_int16_t *)*dstptr)) -+ return -EFAULT; -+ *size -= off; -+ *dstptr += msize; -+ return 0; -+} -+ -+static int ipt_limit_compat_from_user(void *match, void **dstptr, -+ int *size, int off) -+{ -+ struct compat_ipt_entry_match *pm; -+ struct ipt_entry_match *dstpm; -+ struct compat_xt_rateinfo *pinfo; -+ struct xt_rateinfo rinfo; -+ u_int16_t msize; -+ -+ pm = (struct compat_ipt_entry_match *)match; -+ dstpm = (struct ipt_entry_match *)*dstptr; -+ msize = pm->u.user.match_size; -+ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match)); -+ pinfo = (struct compat_xt_rateinfo *)pm->data; -+ memset(&rinfo, 0, sizeof(struct xt_rateinfo)); -+ rinfo.avg = pinfo->avg; -+ rinfo.burst = pinfo->burst; -+ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match), -+ &rinfo, sizeof(struct xt_rateinfo)); -+ msize += off; -+ dstpm->u.user.match_size = msize; -+ *size += off; -+ *dstptr += msize; -+ return 0; -+} -+ -+static int ipt_limit_compat(void *match, void **dstptr, -+ int *size, int convert) -+{ -+ int ret, off; -+ -+ off = XT_ALIGN(sizeof(struct xt_rateinfo)) - -+ COMPAT_XT_ALIGN(sizeof(struct compat_xt_rateinfo)); -+ switch (convert) { -+ case COMPAT_TO_USER: -+ ret = ipt_limit_compat_to_user(match, -+ dstptr, size, off); -+ break; -+ case COMPAT_FROM_USER: -+ ret = ipt_limit_compat_from_user(match, -+ dstptr, size, off); -+ break; -+ case COMPAT_CALC_SIZE: -+ *size += off; -+ ret = 0; -+ break; -+ default: -+ ret = -ENOPROTOOPT; -+ break; -+ } -+ return ret; -+} -+#endif -+ - static struct xt_match ipt_limit_reg = { - .name = "limit", - .match = ipt_limit_match, - .checkentry = ipt_limit_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = ipt_limit_compat, -+#endif - 
.me = THIS_MODULE, - }; - static struct xt_match limit6_reg = { - .name = "limit", - .match = ipt_limit_match, - .checkentry = ipt_limit_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = ipt_limit_compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_xt_limit(void) - { - int ret; - -@@ -165,11 +262,33 @@ static int __init init(void) - return ret; - } - --static void __exit fini(void) -+void fini_xt_limit(void) - { - xt_unregister_match(AF_INET, &ipt_limit_reg); - xt_unregister_match(AF_INET6, &limit6_reg); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_xt_limit(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_xt_limit); -+ KSYMRESOLVE(fini_xt_limit); -+ KSYMMODRESOLVE(xt_limit); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(xt_limit); -+ KSYMUNRESOLVE(init_xt_limit); -+ KSYMUNRESOLVE(fini_xt_limit); -+ fini_xt_limit(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/netfilter/xt_state.c linux-2.6.16-026test009/net/netfilter/xt_state.c ---- linux-2.6.16.orig/net/netfilter/xt_state.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/xt_state.c 2006-04-19 15:02:12.000000000 +0400 -@@ -10,9 +10,11 @@ - - #include <linux/module.h> - #include <linux/skbuff.h> -+#include <linux/nfcalls.h> - #include <net/netfilter/nf_conntrack_compat.h> - #include <linux/netfilter/x_tables.h> - #include <linux/netfilter/xt_state.h> -+#include <linux/netfilter_ipv4/ip_tables.h> - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -@@ -55,10 +57,90 @@ static int check(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat_to_user(void *match, void **dstptr, -+ int *size, int off) -+{ -+ struct ipt_entry_match *pm; -+ struct xt_state_info *pinfo; -+ struct compat_xt_state_info info; -+ u_int16_t msize; -+ -+ pm = (struct ipt_entry_match *)match; -+ msize = pm->u.user.match_size; 
-+ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match))) -+ return -EFAULT; -+ pinfo = (struct xt_state_info *)pm->data; -+ memset(&info, 0, sizeof(struct compat_xt_state_info)); -+ info.statemask = pinfo->statemask; -+ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match), -+ &info, sizeof(struct compat_xt_state_info))) -+ return -EFAULT; -+ msize -= off; -+ if (put_user(msize, (u_int16_t *)*dstptr)) -+ return -EFAULT; -+ *size -= off; -+ *dstptr += msize; -+ return 0; -+} -+ -+static int compat_from_user(void *match, void **dstptr, -+ int *size, int off) -+{ -+ struct compat_ipt_entry_match *pm; -+ struct ipt_entry_match *dstpm; -+ struct compat_xt_state_info *pinfo; -+ struct xt_state_info info; -+ u_int16_t msize; -+ -+ pm = (struct compat_ipt_entry_match *)match; -+ dstpm = (struct ipt_entry_match *)*dstptr; -+ msize = pm->u.user.match_size; -+ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match)); -+ pinfo = (struct compat_xt_state_info *)pm->data; -+ memset(&info, 0, sizeof(struct xt_state_info)); -+ info.statemask = pinfo->statemask; -+ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match), -+ &info, sizeof(struct xt_state_info)); -+ msize += off; -+ dstpm->u.user.match_size = msize; -+ *size += off; -+ *dstptr += msize; -+ return 0; -+} -+ -+static int compat(void *match, void **dstptr, int *size, int convert) -+{ -+ int ret, off; -+ -+ off = XT_ALIGN(sizeof(struct xt_state_info)) - -+ COMPAT_XT_ALIGN(sizeof(struct compat_xt_state_info)); -+ switch (convert) { -+ case COMPAT_TO_USER: -+ ret = compat_to_user(match, dstptr, size, off); -+ break; -+ case COMPAT_FROM_USER: -+ ret = compat_from_user(match, dstptr, size, off); -+ break; -+ case COMPAT_CALC_SIZE: -+ *size += off; -+ ret = 0; -+ break; -+ default: -+ ret = -ENOPROTOOPT; -+ break; -+ } -+ return ret; -+} -+#endif -+ - static struct xt_match state_match = { - .name = "state", - .match = &match, - .checkentry = &check, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me 
= THIS_MODULE, - }; - -@@ -66,15 +148,16 @@ static struct xt_match state6_match = { - .name = "state", - .match = &match, - .checkentry = &check, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_xt_state(void) - { - int ret; - -- need_conntrack(); -- - ret = xt_register_match(AF_INET, &state_match); - if (ret < 0) - return ret; -@@ -86,11 +169,34 @@ static int __init init(void) - return ret; - } - --static void __exit fini(void) -+void fini_xt_state(void) - { - xt_unregister_match(AF_INET, &state_match); - xt_unregister_match(AF_INET6, &state6_match); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ need_conntrack(); -+ err = init_xt_state(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_xt_state); -+ KSYMRESOLVE(fini_xt_state); -+ KSYMMODRESOLVE(xt_state); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(xt_state); -+ KSYMUNRESOLVE(init_xt_state); -+ KSYMUNRESOLVE(fini_xt_state); -+ fini_xt_state(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/netfilter/xt_tcpmss.c linux-2.6.16-026test009/net/netfilter/xt_tcpmss.c ---- linux-2.6.16.orig/net/netfilter/xt_tcpmss.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/xt_tcpmss.c 2006-04-19 15:02:12.000000000 +0400 -@@ -11,6 +11,7 @@ - #include <linux/module.h> - #include <linux/skbuff.h> - #include <net/tcp.h> -+#include <linux/nfcalls.h> - - #include <linux/netfilter/xt_tcpmss.h> - #include <linux/netfilter/x_tables.h> -@@ -133,10 +134,25 @@ checkentry6(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int compat(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = XT_ALIGN(sizeof(struct xt_tcpmss_match_info)) - -+ COMPAT_XT_ALIGN(sizeof(struct xt_tcpmss_match_info)); -+ return ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+#endif -+ - static struct xt_match 
tcpmss_match = { - .name = "tcpmss", - .match = &match, - .checkentry = &checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - -@@ -144,11 +160,14 @@ static struct xt_match tcpmss6_match = { - .name = "tcpmss", - .match = &match, - .checkentry = &checkentry6, -+#ifdef CONFIG_COMPAT -+ .compat = &compat, -+#endif - .me = THIS_MODULE, - }; - - --static int __init init(void) -+int init_xt_tcpmss(void) - { - int ret; - ret = xt_register_match(AF_INET, &tcpmss_match); -@@ -162,11 +181,33 @@ static int __init init(void) - return ret; - } - --static void __exit fini(void) -+void fini_xt_tcpmss(void) - { - xt_unregister_match(AF_INET6, &tcpmss6_match); - xt_unregister_match(AF_INET, &tcpmss_match); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_xt_tcpmss(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_xt_tcpmss); -+ KSYMRESOLVE(fini_xt_tcpmss); -+ KSYMMODRESOLVE(xt_tcpmss); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(xt_tcpmss); -+ KSYMUNRESOLVE(init_xt_tcpmss); -+ KSYMUNRESOLVE(fini_xt_tcpmss); -+ fini_xt_tcpmss(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/netfilter/xt_tcpudp.c linux-2.6.16-026test009/net/netfilter/xt_tcpudp.c ---- linux-2.6.16.orig/net/netfilter/xt_tcpudp.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netfilter/xt_tcpudp.c 2006-04-19 15:02:12.000000000 +0400 -@@ -5,6 +5,7 @@ - #include <net/ipv6.h> - #include <net/tcp.h> - #include <net/udp.h> -+#include <linux/nfcalls.h> - #include <linux/netfilter/x_tables.h> - #include <linux/netfilter/xt_tcpudp.h> - #include <linux/netfilter_ipv4/ip_tables.h> -@@ -266,10 +267,35 @@ udp6_checkentry(const char *tablename, - return 1; - } - -+#ifdef CONFIG_COMPAT -+static int tcp_compat(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = XT_ALIGN(sizeof(struct xt_tcp)) - -+ COMPAT_XT_ALIGN(sizeof(struct xt_tcp)); -+ return 
ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+ -+static int udp_compat(void *match, -+ void **dstptr, int *size, int convert) -+{ -+ int off; -+ -+ off = XT_ALIGN(sizeof(struct xt_udp)) - -+ COMPAT_XT_ALIGN(sizeof(struct xt_udp)); -+ return ipt_match_align_compat(match, dstptr, size, off, convert); -+} -+#endif -+ - static struct xt_match tcp_matchstruct = { - .name = "tcp", - .match = &tcp_match, - .checkentry = &tcp_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &tcp_compat, -+#endif - .me = THIS_MODULE, - }; - static struct xt_match tcp6_matchstruct = { -@@ -283,6 +309,9 @@ static struct xt_match udp_matchstruct = - .name = "udp", - .match = &udp_match, - .checkentry = &udp_checkentry, -+#ifdef CONFIG_COMPAT -+ .compat = &udp_compat, -+#endif - .me = THIS_MODULE, - }; - static struct xt_match udp6_matchstruct = { -@@ -292,7 +321,7 @@ static struct xt_match udp6_matchstruct - .me = THIS_MODULE, - }; - --static int __init init(void) -+int init_xt_tcpudp(void) - { - int ret; - ret = xt_register_match(AF_INET, &tcp_matchstruct); -@@ -322,7 +351,7 @@ out_unreg_tcp: - return ret; - } - --static void __exit fini(void) -+void fini_xt_tcpudp(void) - { - xt_unregister_match(AF_INET6, &udp6_matchstruct); - xt_unregister_match(AF_INET, &udp_matchstruct); -@@ -330,5 +359,27 @@ static void __exit fini(void) - xt_unregister_match(AF_INET, &tcp_matchstruct); - } - -+static int __init init(void) -+{ -+ int err; -+ -+ err = init_xt_tcpudp(); -+ if (err < 0) -+ return err; -+ -+ KSYMRESOLVE(init_xt_tcpudp); -+ KSYMRESOLVE(fini_xt_tcpudp); -+ KSYMMODRESOLVE(xt_tcpudp); -+ return 0; -+} -+ -+static void __exit fini(void) -+{ -+ KSYMMODUNRESOLVE(xt_tcpudp); -+ KSYMUNRESOLVE(init_xt_tcpudp); -+ KSYMUNRESOLVE(fini_xt_tcpudp); -+ fini_xt_tcpudp(); -+} -+ - module_init(init); - module_exit(fini); -diff -upr linux-2.6.16.orig/net/netlink/af_netlink.c linux-2.6.16-026test009/net/netlink/af_netlink.c ---- linux-2.6.16.orig/net/netlink/af_netlink.c 2006-04-19 
15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/netlink/af_netlink.c 2006-04-19 15:02:13.000000000 +0400 -@@ -60,27 +60,14 @@ - #include <net/sock.h> - #include <net/scm.h> - #include <net/netlink.h> -+#include <net/netlink_sock.h> -+ -+#include <ub/beancounter.h> -+#include <ub/ub_net.h> - - #define Nprintk(a...) - #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) - --struct netlink_sock { -- /* struct sock has to be the first member of netlink_sock */ -- struct sock sk; -- u32 pid; -- u32 dst_pid; -- u32 dst_group; -- u32 flags; -- u32 subscriptions; -- u32 ngroups; -- unsigned long *groups; -- unsigned long state; -- wait_queue_head_t wait; -- struct netlink_callback *cb; -- spinlock_t cb_lock; -- void (*data_ready)(struct sock *sk, int bytes); -- struct module *module; --}; - - #define NETLINK_KERNEL_SOCKET 0x1 - #define NETLINK_RECV_PKTINFO 0x2 -@@ -209,7 +196,10 @@ static __inline__ struct sock *netlink_l - read_lock(&nl_table_lock); - head = nl_pid_hashfn(hash, pid); - sk_for_each(sk, node, head) { -- if (nlk_sk(sk)->pid == pid) { -+ /* VEs should find sockets, created by kernel */ -+ if ((nlk_sk(sk)->pid == pid) && -+ (!pid || ve_accessible_strict(VE_OWNER_SK(sk), -+ get_exec_env()))){ - sock_hold(sk); - goto found; - } -@@ -309,7 +299,9 @@ static int netlink_insert(struct sock *s - head = nl_pid_hashfn(hash, pid); - len = 0; - sk_for_each(osk, node, head) { -- if (nlk_sk(osk)->pid == pid) -+ if ((nlk_sk(sk)->pid == pid) && -+ ve_accessible_strict(VE_OWNER_SK(sk), -+ get_exec_env())) - break; - len++; - } -@@ -362,6 +354,8 @@ static int __netlink_create(struct socke - sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); - if (!sk) - return -ENOMEM; -+ if (ub_other_sock_charge(sk)) -+ goto out_free; - - sock_init_data(sock, sk); - -@@ -372,6 +366,10 @@ static int __netlink_create(struct socke - sk->sk_destruct = netlink_sock_destruct; - sk->sk_protocol = protocol; - return 0; -+ -+out_free: -+ sk_free(sk); -+ return -ENOMEM; - } - - 
static int netlink_create(struct socket *sock, int protocol) -@@ -477,7 +475,7 @@ static int netlink_autobind(struct socke - struct hlist_head *head; - struct sock *osk; - struct hlist_node *node; -- s32 pid = current->tgid; -+ s32 pid = virt_pid(current); - int err; - static s32 rover = -4097; - -@@ -486,7 +484,9 @@ retry: - netlink_table_grab(); - head = nl_pid_hashfn(hash, pid); - sk_for_each(osk, node, head) { -- if (nlk_sk(osk)->pid == pid) { -+ if ((nlk_sk(osk)->pid == pid) && -+ ve_accessible_strict(VE_OWNER_SK(osk), -+ get_exec_env())) { - /* Bind collision, search negative pid values. */ - pid = rover--; - if (rover > -4097) -@@ -511,7 +511,7 @@ retry: - static inline int netlink_capable(struct socket *sock, unsigned int flag) - { - return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || -- capable(CAP_NET_ADMIN); -+ capable(CAP_VE_NET_ADMIN); - } - - static void -@@ -845,6 +845,9 @@ static inline int do_one_broadcast(struc - !test_bit(p->group - 1, nlk->groups)) - goto out; - -+ if (!ve_accessible_strict(get_exec_env(), VE_OWNER_SK(sk))) -+ goto out; -+ - if (p->failure) { - netlink_overrun(sk); - goto out; -@@ -942,6 +945,9 @@ static inline int do_one_set_err(struct - !test_bit(p->group - 1, nlk->groups)) - goto out; - -+ if (!ve_accessible_strict(get_exec_env(), VE_OWNER_SK(sk))) -+ goto out; -+ - sk->sk_err = p->code; - sk->sk_error_report(sk); - out: -@@ -1076,12 +1082,17 @@ static int netlink_sendmsg(struct kiocb - struct sock_iocb *siocb = kiocb_to_siocb(kiocb); - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); -- struct sockaddr_nl *addr=msg->msg_name; -+ struct sockaddr_nl *addr = msg->msg_name; - u32 dst_pid; -- u32 dst_group; - struct sk_buff *skb; - int err; - struct scm_cookie scm; -+ struct sock *dstsk; -+ long timeo; -+ int no_ubc, no_buf; -+ unsigned long chargesize; -+ -+ DECLARE_WAITQUEUE(wait, current); - - if (msg->msg_flags&MSG_OOB) - return -EOPNOTSUPP; -@@ -1092,17 +1103,16 @@ static int 
netlink_sendmsg(struct kiocb - if (err < 0) - return err; - -+ /* Broadcasts from user to kernel are disabled. This is OK -+ * according to ANK */ - if (msg->msg_namelen) { - if (addr->nl_family != AF_NETLINK) - return -EINVAL; - dst_pid = addr->nl_pid; -- dst_group = ffs(addr->nl_groups); -- if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) -+ if (addr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) - return -EPERM; -- } else { -+ } else - dst_pid = nlk->dst_pid; -- dst_group = nlk->dst_group; -- } - - if (!nlk->pid) { - err = netlink_autobind(sock); -@@ -1115,12 +1125,12 @@ static int netlink_sendmsg(struct kiocb - goto out; - err = -ENOBUFS; - skb = alloc_skb(len, GFP_KERNEL); -- if (skb==NULL) -+ if (skb == NULL) - goto out; - - NETLINK_CB(skb).pid = nlk->pid; - NETLINK_CB(skb).dst_pid = dst_pid; -- NETLINK_CB(skb).dst_group = dst_group; -+ NETLINK_CB(skb).dst_group = 0; - NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); - memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - -@@ -1131,25 +1141,88 @@ static int netlink_sendmsg(struct kiocb - */ - - err = -EFAULT; -- if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) { -- kfree_skb(skb); -- goto out; -- } -+ if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) -+ goto out_free; - - err = security_netlink_send(sk, skb); -- if (err) { -- kfree_skb(skb); -- goto out; -+ if (err) -+ goto out_free; -+ -+ timeo = sock_sndtimeo(sk, msg->msg_flags&MSG_DONTWAIT); -+retry: -+ dstsk = netlink_getsockbypid(sk, dst_pid); -+ if (IS_ERR(dstsk)) { -+ err = PTR_ERR(dstsk); -+ goto out_free; - } - -- if (dst_group) { -- atomic_inc(&skb->users); -- netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); -+ nlk = nlk_sk(dstsk); -+#ifdef NL_EMULATE_DEV -+ if (nlk->handler) { -+ skb_orphan(skb); -+ err = nlk->handler(protocol, skb); -+ goto out_put; - } -- err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); -+#endif -+ -+ /* BTW, it could be 
done once, before the retry loop */ -+ chargesize = skb_charge_fullsize(skb); -+ no_ubc = ub_sock_getwres_other(sk, chargesize); -+ no_buf = atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf || -+ test_bit(0, &nlk->state); -+ if (no_ubc || no_buf) { -+ wait_queue_head_t *sleep; -+ -+ if (!no_ubc) -+ ub_sock_retwres_other(sk, chargesize, -+ SOCK_MIN_UBCSPACE_CH); -+ err = -EAGAIN; -+ if (timeo == 0) { -+ kfree_skb(skb); -+ goto out_put; -+ } -+ -+ /* wake up comes to different queues */ -+ sleep = no_ubc ? sk->sk_sleep : &nlk->wait; -+ __set_current_state(TASK_INTERRUPTIBLE); -+ add_wait_queue(sleep, &wait); - -+ /* this if can't be moved upper because ub_sock_snd_queue_add() -+ * may change task state to TASK_RUNNING */ -+ if (no_ubc) -+ ub_sock_sndqueueadd_other(sk, chargesize); -+ -+ if ((atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf || -+ test_bit(0, &nlk->state) || no_ubc) && -+ !sock_flag(dstsk, SOCK_DEAD)) -+ timeo = schedule_timeout(timeo); -+ -+ __set_current_state(TASK_RUNNING); -+ remove_wait_queue(sleep, &wait); -+ if (no_ubc) -+ ub_sock_sndqueuedel(sk); -+ sock_put(dstsk); -+ -+ if (!signal_pending(current)) -+ goto retry; -+ err = sock_intr_errno(timeo); -+ goto out_free; -+ } -+ -+ skb_orphan(skb); -+ skb_set_owner_r(skb, dstsk); -+ ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); -+ skb_queue_tail(&dstsk->sk_receive_queue, skb); -+ dstsk->sk_data_ready(dstsk, len); -+ err = len; -+out_put: -+ sock_put(dstsk); - out: - return err; -+ -+out_free: -+ kfree_skb(skb); -+ return err; - } - - static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, -@@ -1303,6 +1376,10 @@ static int netlink_dump(struct sock *sk) - skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); - if (!skb) - return -ENOBUFS; -+ if (ub_nlrcvbuf_charge(skb, sk) < 0) { -+ kfree_skb(skb); -+ return -EACCES; -+ } - - spin_lock(&nlk->cb_lock); - -@@ -1471,8 +1548,15 @@ void netlink_run_queue(struct sock *sk, - *qlen = skb_queue_len(&sk->sk_receive_queue); - 
- for (; *qlen; (*qlen)--) { -+ int ret; -+ struct ve_struct *old_env; - skb = skb_dequeue(&sk->sk_receive_queue); -- if (netlink_rcv_skb(skb, cb)) { -+ -+ old_env = set_exec_env(VE_OWNER_SKB(skb)); -+ ret = netlink_rcv_skb(skb, cb); -+ (void)set_exec_env(old_env); -+ -+ if (ret) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, skb); - else { -@@ -1740,6 +1824,7 @@ enomem: - - sock_register(&netlink_family_ops); - #ifdef CONFIG_PROC_FS -+ /* FIXME: virtualize before give access from VEs */ - proc_net_fops_create("netlink", 0, &netlink_seq_fops); - #endif - /* The netlink device handler may be needed early. */ -diff -upr linux-2.6.16.orig/net/packet/af_packet.c linux-2.6.16-026test009/net/packet/af_packet.c ---- linux-2.6.16.orig/net/packet/af_packet.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/packet/af_packet.c 2006-04-19 15:02:12.000000000 +0400 -@@ -79,6 +79,8 @@ - #include <linux/module.h> - #include <linux/init.h> - -+#include <ub/ub_net.h> -+ - #ifdef CONFIG_INET - #include <net/inet_common.h> - #endif -@@ -280,7 +282,8 @@ static int packet_rcv_spkt(struct sk_buf - * so that this procedure is noop. 
- */ - -- if (skb->pkt_type == PACKET_LOOPBACK) -+ if (skb->pkt_type == PACKET_LOOPBACK || -+ !ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) - goto out; - - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) -@@ -472,6 +475,9 @@ static int packet_rcv(struct sk_buff *sk - sk = pt->af_packet_priv; - po = pkt_sk(sk); - -+ if (!ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) -+ goto drop; -+ - skb->dev = dev; - - if (dev->hard_header) { -@@ -531,6 +537,9 @@ static int packet_rcv(struct sk_buff *sk - if (pskb_trim(skb, snaplen)) - goto drop_n_acct; - -+ if (ub_sockrcvbuf_charge(sk, skb)) -+ goto drop_n_acct; -+ - skb_set_owner_r(skb, sk); - skb->dev = NULL; - dst_release(skb->dst); -@@ -581,6 +590,9 @@ static int tpacket_rcv(struct sk_buff *s - sk = pt->af_packet_priv; - po = pkt_sk(sk); - -+ if (!ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) -+ goto drop; -+ - if (dev->hard_header) { - if (sk->sk_type != SOCK_DGRAM) - skb_push(skb, skb->data - skb->mac.raw); -@@ -630,6 +642,12 @@ static int tpacket_rcv(struct sk_buff *s - if (snaplen > skb->len-skb->data_len) - snaplen = skb->len-skb->data_len; - -+ if (copy_skb && -+ ub_sockrcvbuf_charge(sk, copy_skb)) { -+ spin_lock(&sk->sk_receive_queue.lock); -+ goto ring_is_full; -+ } -+ - spin_lock(&sk->sk_receive_queue.lock); - h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head); - -@@ -1010,6 +1028,8 @@ static int packet_create(struct socket * - sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1); - if (sk == NULL) - goto out; -+ if (ub_other_sock_charge(sk)) -+ goto out_free; - - sock->ops = &packet_ops; - #ifdef CONFIG_SOCK_PACKET -@@ -1048,6 +1068,9 @@ static int packet_create(struct socket * - sk_add_node(sk, &packet_sklist); - write_unlock_bh(&packet_sklist_lock); - return(0); -+ -+out_free: -+ sk_free(sk); - out: - return err; - } -@@ -1430,11 +1453,16 @@ static int packet_notifier(struct notifi - struct sock *sk; - struct hlist_node *node; - struct net_device *dev = (struct net_device*)data; -+ 
struct ve_struct *ve; - -+ ve = get_exec_env(); - read_lock(&packet_sklist_lock); - sk_for_each(sk, node, &packet_sklist) { - struct packet_sock *po = pkt_sk(sk); - -+ if (!ve_accessible_strict(VE_OWNER_SK(sk), ve)) -+ continue; -+ - switch (msg) { - case NETDEV_UNREGISTER: - #ifdef CONFIG_PACKET_MULTICAST -@@ -1845,6 +1873,8 @@ static inline struct sock *packet_seq_id - struct hlist_node *node; - - sk_for_each(s, node, &packet_sklist) { -+ if (!ve_accessible(VE_OWNER_SK(s), get_exec_env())) -+ continue; - if (!off--) - return s; - } -@@ -1860,9 +1890,13 @@ static void *packet_seq_start(struct seq - static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) - { - ++*pos; -- return (v == SEQ_START_TOKEN) -- ? sk_head(&packet_sklist) -- : sk_next((struct sock*)v) ; -+ do { -+ v = (v == SEQ_START_TOKEN) -+ ? sk_head(&packet_sklist) -+ : sk_next((struct sock*)v); -+ } while (v != NULL && -+ !ve_accessible(VE_OWNER_SK((struct sock*)v), get_exec_env())); -+ return v; - } - - static void packet_seq_stop(struct seq_file *seq, void *v) -@@ -1918,7 +1952,7 @@ static struct file_operations packet_seq - - static void __exit packet_exit(void) - { -- proc_net_remove("packet"); -+ remove_proc_glob_entry("net/packet", NULL); - unregister_netdevice_notifier(&packet_netdev_notifier); - sock_unregister(PF_PACKET); - proto_unregister(&packet_proto); -@@ -1933,7 +1967,7 @@ static int __init packet_init(void) - - sock_register(&packet_family_ops); - register_netdevice_notifier(&packet_netdev_notifier); -- proc_net_fops_create("packet", 0, &packet_seq_fops); -+ proc_glob_fops_create("net/packet", 0, &packet_seq_fops); - out: - return rc; - } -diff -upr linux-2.6.16.orig/net/sched/sch_generic.c linux-2.6.16-026test009/net/sched/sch_generic.c ---- linux-2.6.16.orig/net/sched/sch_generic.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/sched/sch_generic.c 2006-04-19 15:02:12.000000000 +0400 -@@ -97,6 +97,7 @@ int qdisc_restart(struct net_device *dev - - 
/* Dequeue packet */ - if ((skb = q->dequeue(q)) != NULL) { -+ struct ve_struct *envid; - unsigned nolock = (dev->features & NETIF_F_LLTX); - /* - * When the driver has LLTX set it does its own locking -@@ -107,6 +108,7 @@ int qdisc_restart(struct net_device *dev - * of lock congestion it should return -1 and the packet - * will be requeued. - */ -+ envid = set_exec_env(VE_OWNER_SKB(skb)); - if (!nolock) { - if (!spin_trylock(&dev->xmit_lock)) { - collision: -@@ -121,6 +123,7 @@ int qdisc_restart(struct net_device *dev - kfree_skb(skb); - if (net_ratelimit()) - printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); -+ (void)set_exec_env(envid); - return -1; - } - __get_cpu_var(netdev_rx_stat).cpu_collision++; -@@ -146,6 +149,7 @@ int qdisc_restart(struct net_device *dev - spin_unlock(&dev->xmit_lock); - } - spin_lock(&dev->queue_lock); -+ (void)set_exec_env(envid); - return -1; - } - if (ret == NETDEV_TX_LOCKED && nolock) { -@@ -177,6 +181,7 @@ int qdisc_restart(struct net_device *dev - requeue: - q->ops->requeue(skb, q); - netif_schedule(dev); -+ (void)set_exec_env(envid); - return 1; - } - BUG_ON((int) q->q.qlen < 0); -@@ -625,3 +630,4 @@ EXPORT_SYMBOL(qdisc_reset); - EXPORT_SYMBOL(qdisc_restart); - EXPORT_SYMBOL(qdisc_lock_tree); - EXPORT_SYMBOL(qdisc_unlock_tree); -+EXPORT_SYMBOL(dev_shutdown); -diff -upr linux-2.6.16.orig/net/sched/sch_teql.c linux-2.6.16-026test009/net/sched/sch_teql.c ---- linux-2.6.16.orig/net/sched/sch_teql.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/sched/sch_teql.c 2006-04-19 15:02:12.000000000 +0400 -@@ -189,6 +189,9 @@ static int teql_qdisc_init(struct Qdisc - struct teql_master *m = (struct teql_master*)sch->ops; - struct teql_sched_data *q = qdisc_priv(sch); - -+ if (!capable(CAP_NET_ADMIN)) -+ return -EPERM; -+ - if (dev->hard_header_len > m->dev->hard_header_len) - return -EINVAL; - -diff -upr linux-2.6.16.orig/net/socket.c linux-2.6.16-026test009/net/socket.c ---- 
linux-2.6.16.orig/net/socket.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/socket.c 2006-04-19 15:02:12.000000000 +0400 -@@ -84,6 +84,7 @@ - #include <linux/compat.h> - #include <linux/kmod.h> - #include <linux/audit.h> -+#include <linux/in.h> - - #ifdef CONFIG_NET_RADIO - #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ -@@ -1075,6 +1076,37 @@ int sock_wake_async(struct socket *sock, - return 0; - } - -+int vz_security_proto_check(int family, int type, int protocol) -+{ -+#ifdef CONFIG_VE -+ if (ve_is_super(get_exec_env())) -+ return 0; -+ -+ switch (family) { -+ case PF_UNSPEC: -+ case PF_PACKET: -+ case PF_NETLINK: -+ case PF_UNIX: -+ break; -+ case PF_INET: -+ switch (protocol) { -+ case IPPROTO_IP: -+ case IPPROTO_ICMP: -+ case IPPROTO_TCP: -+ case IPPROTO_UDP: -+ case IPPROTO_RAW: -+ break; -+ default: -+ return -EAFNOSUPPORT; -+ } -+ break; -+ default: -+ return -EAFNOSUPPORT; -+ } -+#endif -+ return 0; -+} -+ - static int __sock_create(int family, int type, int protocol, struct socket **res, int kern) - { - int err; -@@ -1102,6 +1134,11 @@ static int __sock_create(int family, int - family = PF_PACKET; - } - -+ /* VZ compatibility layer */ -+ err = vz_security_proto_check(family, type, protocol); -+ if (err < 0) -+ return err; -+ - err = security_socket_create(family, type, protocol, kern); - if (err) - return err; -diff -upr linux-2.6.16.orig/net/sunrpc/clnt.c linux-2.6.16-026test009/net/sunrpc/clnt.c ---- linux-2.6.16.orig/net/sunrpc/clnt.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/sunrpc/clnt.c 2006-04-19 15:02:12.000000000 +0400 -@@ -168,10 +168,10 @@ rpc_new_client(struct rpc_xprt *xprt, ch - } - - /* save the nodename */ -- clnt->cl_nodelen = strlen(system_utsname.nodename); -+ clnt->cl_nodelen = strlen(ve_utsname.nodename); - if (clnt->cl_nodelen > UNX_MAXNODENAME) - clnt->cl_nodelen = UNX_MAXNODENAME; -- memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); -+ 
memcpy(clnt->cl_nodename, ve_utsname.nodename, clnt->cl_nodelen); - return clnt; - - out_no_auth: -diff -upr linux-2.6.16.orig/net/sunrpc/sched.c linux-2.6.16-026test009/net/sunrpc/sched.c ---- linux-2.6.16.orig/net/sunrpc/sched.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/sunrpc/sched.c 2006-04-19 15:02:12.000000000 +0400 -@@ -605,7 +605,9 @@ EXPORT_SYMBOL(rpc_exit_task); - static int __rpc_execute(struct rpc_task *task) - { - int status = 0; -+ struct ve_struct *env; - -+ env = set_exec_env(get_ve0()); - dprintk("RPC: %4d rpc_execute flgs %x\n", - task->tk_pid, task->tk_flags); - -@@ -693,6 +695,7 @@ static int __rpc_execute(struct rpc_task - rpc_mark_complete_task(task); - /* Release all resources associated with the task */ - rpc_release_task(task); -+ (void)set_exec_env(env); - return status; - } - -diff -upr linux-2.6.16.orig/net/sunrpc/svcsock.c linux-2.6.16-026test009/net/sunrpc/svcsock.c ---- linux-2.6.16.orig/net/sunrpc/svcsock.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/sunrpc/svcsock.c 2006-04-19 15:02:12.000000000 +0400 -@@ -361,6 +361,9 @@ svc_sendto(struct svc_rqst *rqstp, struc - size_t base = xdr->page_base; - unsigned int pglen = xdr->page_len; - unsigned int flags = MSG_MORE; -+ struct ve_struct *old_env; -+ -+ old_env = set_exec_env(get_ve0()); - - slen = xdr->len; - -@@ -425,6 +428,8 @@ out: - rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, - rqstp->rq_addr.sin_addr.s_addr); - -+ (void)set_exec_env(old_env); -+ - return len; - } - -@@ -437,9 +442,12 @@ svc_recv_available(struct svc_sock *svsk - mm_segment_t oldfs; - struct socket *sock = svsk->sk_sock; - int avail, err; -+ struct ve_struct *old_env; - - oldfs = get_fs(); set_fs(KERNEL_DS); -+ old_env = set_exec_env(get_ve0()); - err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail); -+ (void)set_exec_env(old_env); - set_fs(oldfs); - - return (err >= 0)? 
avail : err; -@@ -454,6 +462,7 @@ svc_recvfrom(struct svc_rqst *rqstp, str - struct msghdr msg; - struct socket *sock; - int len, alen; -+ struct ve_struct *old_env; - - rqstp->rq_addrlen = sizeof(rqstp->rq_addr); - sock = rqstp->rq_sock->sk_sock; -@@ -465,7 +474,9 @@ svc_recvfrom(struct svc_rqst *rqstp, str - - msg.msg_flags = MSG_DONTWAIT; - -+ old_env = set_exec_env(get_ve0()); - len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT); -+ (void)set_exec_env(get_ve0()); - - /* sock_recvmsg doesn't fill in the name/namelen, so we must.. - * possibly we should cache this in the svc_sock structure -@@ -761,17 +772,19 @@ svc_tcp_accept(struct svc_sock *svsk) - const struct proto_ops *ops; - struct svc_sock *newsvsk; - int err, slen; -+ struct ve_struct *old_env; - - dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); - if (!sock) - return; - -+ old_env = set_exec_env(get_ve0()); - err = sock_create_lite(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock); - if (err) { - if (err == -ENOMEM) - printk(KERN_WARNING "%s: no more sockets!\n", - serv->sv_name); -- return; -+ goto restore; - } - - dprintk("svc: tcp_accept %p allocated\n", newsock); -@@ -865,6 +878,8 @@ svc_tcp_accept(struct svc_sock *svsk) - - } - -+ (void)set_exec_env(old_env); -+ - if (serv->sv_stats) - serv->sv_stats->nettcpconn++; - -@@ -872,6 +887,8 @@ svc_tcp_accept(struct svc_sock *svsk) - - failed: - sock_release(newsock); -+restore: -+ (void)set_exec_env(old_env); - return; - } - -@@ -1388,6 +1405,7 @@ svc_create_socket(struct svc_serv *serv, - struct socket *sock; - int error; - int type; -+ struct ve_struct *old_env; - - dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n", - serv->sv_program->pg_name, protocol, -@@ -1401,8 +1419,10 @@ svc_create_socket(struct svc_serv *serv, - } - type = (protocol == IPPROTO_UDP)? 
SOCK_DGRAM : SOCK_STREAM; - -+ old_env = set_exec_env(get_ve0()); -+ - if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0) -- return error; -+ goto restore; - - if (sin != NULL) { - if (type == SOCK_STREAM) -@@ -1418,12 +1438,16 @@ svc_create_socket(struct svc_serv *serv, - goto bummer; - } - -- if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) -+ if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) { -+ (void)set_exec_env(old_env); - return 0; -+ } - - bummer: - dprintk("svc: svc_create_socket error = %d\n", -error); - sock_release(sock); -+restore: -+ (void)set_exec_env(old_env); - return error; - } - -diff -upr linux-2.6.16.orig/net/unix/af_unix.c linux-2.6.16-026test009/net/unix/af_unix.c ---- linux-2.6.16.orig/net/unix/af_unix.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/unix/af_unix.c 2006-04-19 15:02:12.000000000 +0400 -@@ -118,6 +118,9 @@ - #include <net/checksum.h> - #include <linux/security.h> - -+#include <ub/ub_net.h> -+#include <ub/beancounter.h> -+ - int sysctl_unix_max_dgram_qlen = 10; - - struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -@@ -235,6 +238,8 @@ static struct sock *__unix_find_socket_b - sk_for_each(s, node, &unix_socket_table[hash ^ type]) { - struct unix_sock *u = unix_sk(s); - -+ if (!ve_accessible(VE_OWNER_SK(s), get_exec_env())) -+ continue; - if (u->addr->len == len && - !memcmp(u->addr->name, sunname, len)) - goto found; -@@ -439,7 +444,7 @@ static int unix_listen(struct socket *so - sk->sk_max_ack_backlog = backlog; - sk->sk_state = TCP_LISTEN; - /* set credentials so connect can copy them */ -- sk->sk_peercred.pid = current->tgid; -+ sk->sk_peercred.pid = virt_tgid(current); - sk->sk_peercred.uid = current->euid; - sk->sk_peercred.gid = current->egid; - err = 0; -@@ -553,6 +558,8 @@ static struct sock * unix_create1(struct - sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); - if (!sk) - goto out; -+ if (ub_other_sock_charge(sk)) -+ goto out_sk_free; - - 
atomic_inc(&unix_nr_socks); - -@@ -571,6 +578,9 @@ static struct sock * unix_create1(struct - unix_insert_socket(unix_sockets_unbound, sk); - out: - return sk; -+out_sk_free: -+ sk_free(sk); -+ return NULL; - } - - static int unix_create(struct socket *sock, int protocol) -@@ -676,7 +686,7 @@ static struct sock *unix_find_other(stru - err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); - if (err) - goto fail; -- err = vfs_permission(&nd, MAY_WRITE); -+ err = vfs_permission(&nd, MAY_WRITE, NULL); - if (err) - goto put_fail; - -@@ -932,6 +942,7 @@ static int unix_stream_connect(struct so - int st; - int err; - long timeo; -+ unsigned long chargesize; - - err = unix_mkname(sunaddr, addr_len, &hash); - if (err < 0) -@@ -960,6 +971,10 @@ static int unix_stream_connect(struct so - skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); - if (skb == NULL) - goto out; -+ chargesize = skb_charge_fullsize(skb); -+ if (ub_sock_getwres_other(newsk, chargesize) < 0) -+ goto out; -+ ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF); - - restart: - /* Find listening sock. 
*/ -@@ -1043,7 +1058,7 @@ restart: - unix_peer(newsk) = sk; - newsk->sk_state = TCP_ESTABLISHED; - newsk->sk_type = sk->sk_type; -- newsk->sk_peercred.pid = current->tgid; -+ newsk->sk_peercred.pid = virt_tgid(current); - newsk->sk_peercred.uid = current->euid; - newsk->sk_peercred.gid = current->egid; - newu = unix_sk(newsk); -@@ -1107,7 +1122,7 @@ static int unix_socketpair(struct socket - sock_hold(skb); - unix_peer(ska)=skb; - unix_peer(skb)=ska; -- ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid; -+ ska->sk_peercred.pid = skb->sk_peercred.pid = virt_tgid(current); - ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid; - ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid; - -@@ -1433,6 +1448,16 @@ static int unix_stream_sendmsg(struct ki - - size=len-sent; - -+ if (msg->msg_flags & MSG_DONTWAIT) -+ ub_sock_makewres_other(sk, skb_charge_size(size)); -+ if (sock_bc(sk) != NULL && -+ sock_bc(sk)->poll_reserv >= -+ SOCK_MIN_UBCSPACE && -+ skb_charge_size(size) > -+ sock_bc(sk)->poll_reserv) -+ size = skb_charge_datalen(sock_bc(sk)->poll_reserv); -+ -+ - /* Keep two messages in the pipe so it schedules better */ - if (size > sk->sk_sndbuf / 2 - 64) - size = sk->sk_sndbuf / 2 - 64; -@@ -1444,7 +1469,8 @@ static int unix_stream_sendmsg(struct ki - * Grab a buffer - */ - -- skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); -+ skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE, -+ msg->msg_flags&MSG_DONTWAIT, &err); - - if (skb==NULL) - goto out_err; -@@ -1869,6 +1895,7 @@ static unsigned int unix_poll(struct fil - { - struct sock *sk = sock->sk; - unsigned int mask; -+ int no_ub_res; - - poll_wait(file, sk->sk_sleep, wait); - mask = 0; -@@ -1879,6 +1906,10 @@ static unsigned int unix_poll(struct fil - if (sk->sk_shutdown == SHUTDOWN_MASK) - mask |= POLLHUP; - -+ no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); -+ if (no_ub_res) -+ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); -+ - /* readable? 
*/ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) -@@ -1892,7 +1923,7 @@ static unsigned int unix_poll(struct fil - * we set writable also when the other side has shut down the - * connection. This prevents stuck sockets. - */ -- if (unix_writable(sk)) -+ if (!no_ub_res && unix_writable(sk)) - mask |= POLLOUT | POLLWRNORM | POLLWRBAND; - - return mask; -@@ -2044,7 +2075,7 @@ static int __init af_unix_init(void) - - sock_register(&unix_family_ops); - #ifdef CONFIG_PROC_FS -- proc_net_fops_create("unix", 0, &unix_seq_fops); -+ proc_glob_fops_create("net/unix", 0, &unix_seq_fops); - #endif - unix_sysctl_register(); - out: -@@ -2055,7 +2086,7 @@ static void __exit af_unix_exit(void) - { - sock_unregister(PF_UNIX); - unix_sysctl_unregister(); -- proc_net_remove("unix"); -+ remove_proc_glob_entry("net/unix", NULL); - proto_unregister(&unix_proto); - } - -diff -upr linux-2.6.16.orig/net/unix/garbage.c linux-2.6.16-026test009/net/unix/garbage.c ---- linux-2.6.16.orig/net/unix/garbage.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/net/unix/garbage.c 2006-04-19 15:02:13.000000000 +0400 -@@ -76,6 +76,7 @@ - #include <linux/netdevice.h> - #include <linux/file.h> - #include <linux/proc_fs.h> -+#include <linux/module.h> - - #include <net/sock.h> - #include <net/af_unix.h> -@@ -135,7 +136,7 @@ void unix_notinflight(struct file *fp) - atomic_dec(&unix_tot_inflight); - } - } -- -+EXPORT_SYMBOL_GPL(unix_notinflight); - - /* - * Garbage Collector Support Functions -diff -upr linux-2.6.16.orig/security/commoncap.c linux-2.6.16-026test009/security/commoncap.c ---- linux-2.6.16.orig/security/commoncap.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/security/commoncap.c 2006-04-19 15:02:12.000000000 +0400 -@@ -35,7 +35,7 @@ EXPORT_SYMBOL(cap_netlink_send); - - int cap_netlink_recv(struct sk_buff *skb) - { -- if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) -+ if (!cap_raised(NETLINK_CB(skb).eff_cap, 
CAP_VE_NET_ADMIN)) - return -EPERM; - return 0; - } -@@ -197,7 +197,7 @@ int cap_inode_setxattr(struct dentry *de - { - if (!strncmp(name, XATTR_SECURITY_PREFIX, - sizeof(XATTR_SECURITY_PREFIX) - 1) && -- !capable(CAP_SYS_ADMIN)) -+ !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) - return -EPERM; - return 0; - } -@@ -206,7 +206,7 @@ int cap_inode_removexattr(struct dentry - { - if (!strncmp(name, XATTR_SECURITY_PREFIX, - sizeof(XATTR_SECURITY_PREFIX) - 1) && -- !capable(CAP_SYS_ADMIN)) -+ !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) - return -EPERM; - return 0; - } -@@ -312,7 +312,7 @@ void cap_task_reparent_to_init (struct t - - int cap_syslog (int type) - { -- if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) -+ if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN)) - return -EPERM; - return 0; - } -diff -upr linux-2.6.16.orig/security/keys/key.c linux-2.6.16-026test009/security/keys/key.c ---- linux-2.6.16.orig/security/keys/key.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/security/keys/key.c 2006-04-19 15:02:11.000000000 +0400 -@@ -785,6 +785,10 @@ key_ref_t key_create_or_update(key_ref_t - - key_check(keyring); - -+ key_ref = ERR_PTR(-ENOTDIR); -+ if (keyring->type != &key_type_keyring) -+ goto error_2; -+ - down_write(&keyring->sem); - - /* if we're going to allocate a new key, we're going to have -diff -upr linux-2.6.16.orig/security/keys/keyring.c linux-2.6.16-026test009/security/keys/keyring.c ---- linux-2.6.16.orig/security/keys/keyring.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/security/keys/keyring.c 2006-04-19 15:02:11.000000000 +0400 -@@ -437,6 +437,7 @@ EXPORT_SYMBOL(keyring_search); - /* - * search the given keyring only (no recursion) - * - keyring must be locked by caller -+ * - caller must guarantee that the keyring is a keyring - */ - key_ref_t __keyring_search_one(key_ref_t keyring_ref, - const struct key_type *ktype, -diff -upr linux-2.6.16.orig/security/selinux/hooks.c 
linux-2.6.16-026test009/security/selinux/hooks.c ---- linux-2.6.16.orig/security/selinux/hooks.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/security/selinux/hooks.c 2006-04-19 15:02:12.000000000 +0400 -@@ -4167,12 +4167,12 @@ static int selinux_setprocattr(struct ta - struct task_struct *g, *t; - struct mm_struct *mm = p->mm; - read_lock(&tasklist_lock); -- do_each_thread(g, t) -+ do_each_thread_ve(g, t) - if (t->mm == mm && t != p) { - read_unlock(&tasklist_lock); - return -EPERM; - } -- while_each_thread(g, t); -+ while_each_thread_ve(g, t); - read_unlock(&tasklist_lock); - } - -diff -upr linux-2.6.16.orig/sound/isa/opti9xx/opti92x-ad1848.c linux-2.6.16-026test009/sound/isa/opti9xx/opti92x-ad1848.c ---- linux-2.6.16.orig/sound/isa/opti9xx/opti92x-ad1848.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/sound/isa/opti9xx/opti92x-ad1848.c 2006-04-19 15:02:11.000000000 +0400 -@@ -2088,9 +2088,11 @@ static int __init alsa_card_opti9xx_init - int error; - struct platform_device *device; - -+#ifdef CONFIG_PNP - pnp_register_card_driver(&opti9xx_pnpc_driver); - if (snd_opti9xx_pnp_is_probed) - return 0; -+#endif - if (! 
is_isapnp_selected()) { - error = platform_driver_register(&snd_opti9xx_driver); - if (error < 0) -@@ -2102,7 +2104,9 @@ static int __init alsa_card_opti9xx_init - } - platform_driver_unregister(&snd_opti9xx_driver); - } -+#ifdef CONFIG_PNP - pnp_unregister_card_driver(&opti9xx_pnpc_driver); -+#endif - #ifdef MODULE - printk(KERN_ERR "no OPTi " CHIP_NAME " soundcard found\n"); - #endif -@@ -2115,7 +2119,9 @@ static void __exit alsa_card_opti9xx_exi - platform_device_unregister(snd_opti9xx_platform_device); - platform_driver_unregister(&snd_opti9xx_driver); - } -+#ifdef CONFIG_PNP - pnp_unregister_card_driver(&opti9xx_pnpc_driver); -+#endif - } - - module_init(alsa_card_opti9xx_init) -diff -upr linux-2.6.16.orig/sound/pci/hda/patch_realtek.c linux-2.6.16-026test009/sound/pci/hda/patch_realtek.c ---- linux-2.6.16.orig/sound/pci/hda/patch_realtek.c 2006-04-19 15:02:02.000000000 +0400 -+++ linux-2.6.16-026test009/sound/pci/hda/patch_realtek.c 2006-04-19 15:02:11.000000000 +0400 -@@ -2948,6 +2948,8 @@ static struct hda_board_config alc260_cf - { .modelname = "basic", .config = ALC260_BASIC }, - { .pci_subvendor = 0x104d, .pci_subdevice = 0x81bb, - .config = ALC260_BASIC }, /* Sony VAIO */ -+ { .pci_subvendor = 0x152d, .pci_subdevice = 0x0729, -+ .config = ALC260_BASIC }, /* CTL Travel Master U553W */ - { .modelname = "hp", .config = ALC260_HP }, - { .pci_subvendor = 0x103c, .pci_subdevice = 0x3010, .config = ALC260_HP }, - { .pci_subvendor = 0x103c, .pci_subdevice = 0x3011, .config = ALC260_HP }, |